Sampling Swire for Coke Heroes

Ian’s ML

April 14, 2024

Set up

Taking a sample of the whole dataset

df <- readRDS("swire_no_nas.rds")  #inject the data and we will sub-sample
regions_joinme <- read.csv("states_summary.csv")

unique(regions_joinme$REGION)
##  [1] "NORTHERN"    "DESERT_SW"   "PRAIRIE"     "CALI_NEVADA" "MOUNTAIN"   
##  [6] "SOCAL"       "ARIZONA"     "NEWMEXICO"   "NOCAL"       "COLORADO"   
## [11] "KANSAS"
# "NORTHERN"    "DESERT_SW"   "PRAIRIE"     "CALI_NEVADA"  "MOUNTAIN"    "SOCAL"   "ARIZONA"    "NEWMEXICO"   "NOCAL"    "COLORADO"    "KANSAS" 

str(regions_joinme)
## 'data.frame':    200 obs. of  2 variables:
##  $ MARKET_KEY: int  13 70 179 197 272 352 32 33 44 50 ...
##  $ REGION    : chr  "NORTHERN" "NORTHERN" "DESERT_SW" "DESERT_SW" ...
# Perform a left join using the merge() function
df <- merge(df, regions_joinme[, c("MARKET_KEY", "REGION")], by = "MARKET_KEY", all.x = TRUE)
rm(regions_joinme)

Quick imputations

# Update CALORIC_SEGMENT values: 0 if 'DIET/LIGHT', otherwise 1
df$CALORIC_SEGMENT <- ifelse(df$CALORIC_SEGMENT == "DIET/LIGHT", 0, 1)
df$MARKET_KEY <- as.character(df$MARKET_KEY)
df <- df %>%
  mutate(
    MONTH = as.numeric(substr(DATE, 6, 7)),  # Extract the month from YYYY-MM-DD format
    SEASON = case_when(
      MONTH %in% c(12, 01, 02) ~ "WINTER",
      MONTH %in% c(03, 04, 05) ~ "SPRING",
      MONTH %in% c(06, 07, 08) ~ "SUMMER",
      MONTH %in% c(09, 10, 11) ~ "FALL",
      TRUE ~ NA_character_  # This is just in case there are any undefined values
    )
  )
str(df)
## 'data.frame':    24461424 obs. of  13 variables:
##  $ MARKET_KEY     : chr  "1" "1" "1" "1" ...
##  $ DATE           : chr  "2021-10-16" "2022-06-04" "2022-02-05" "2022-10-08" ...
##  $ CALORIC_SEGMENT: num  0 0 1 0 0 1 0 0 1 0 ...
##  $ CATEGORY       : chr  "ENERGY" "SSD" "SSD" "SSD" ...
##  $ UNIT_SALES     : num  434 28 42 1 26 161 6 5 68 90 ...
##  $ DOLLAR_SALES   : num  924.04 147.77 25.13 0.99 94.56 ...
##  $ MANUFACTURER   : chr  "PONYS" "SWIRE-CC" "COCOS" "JOLLYS" ...
##  $ BRAND          : chr  "MYTHICAL BEVERAGE ULTRA" "DIET PEPPY CF" "HANSENIZZLE'S ECO" "DIET PAPI" ...
##  $ PACKAGE        : chr  "16SMALL MULTI CUP" "12SMALL 12ONE CUP" "12SMALL 6ONE CUP" "12SMALL 6ONE CUP" ...
##  $ ITEM           : chr  "MYTHICAL BEVERAGE ULTRA SUNRISE ENERGY DRINK UNFLAVORED ZERO SUGAR CUP 16 LIQUID SMALL" "DIET PEPPY CAFFEINE FREE GENTLE DRINK RED  PEPPER COLA DIET CUP 12 LIQUID SMALL X12" "HANSENIZZLE'S ECO GENTLE DRINK MANDARIN DURIAN  CUP 12 LIQUID SMALL" "DIET PAPI GENTLE DRINK COLA DIET CUP 12 LIQUID SMALL" ...
##  $ REGION         : chr  "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
##  $ MONTH          : num  10 6 2 10 7 9 9 6 10 5 ...
##  $ SEASON         : chr  "FALL" "SUMMER" "WINTER" "FALL" ...

Making a 10% sample of the data to shrink it

# Assuming df is your dataframe
set.seed(123) # Set a random seed for reproducibility
sampled_df <- df[sample(1:nrow(df), 2446143), ]
rm(df)
df <- sampled_df
rm(sampled_df)
#skim(df)
summary(df)
##   MARKET_KEY            DATE           CALORIC_SEGMENT    CATEGORY        
##  Length:2446143     Length:2446143     Min.   :0.0000   Length:2446143    
##  Class :character   Class :character   1st Qu.:0.0000   Class :character  
##  Mode  :character   Mode  :character   Median :1.0000   Mode  :character  
##                                        Mean   :0.5025                     
##                                        3rd Qu.:1.0000                     
##                                        Max.   :1.0000                     
##    UNIT_SALES        DOLLAR_SALES      MANUFACTURER          BRAND          
##  Min.   :    0.04   Min.   :     0.0   Length:2446143     Length:2446143    
##  1st Qu.:   11.00   1st Qu.:    36.5   Class :character   Class :character  
##  Median :   40.00   Median :   135.1   Mode  :character   Mode  :character  
##  Mean   :  173.43   Mean   :   587.4                                        
##  3rd Qu.:  126.00   3rd Qu.:   427.4                                        
##  Max.   :91778.00   Max.   :409159.3                                        
##    PACKAGE              ITEM              REGION              MONTH       
##  Length:2446143     Length:2446143     Length:2446143     Min.   : 1.000  
##  Class :character   Class :character   Class :character   1st Qu.: 3.000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 6.000  
##                                                           Mean   : 6.283  
##                                                           3rd Qu.: 9.000  
##                                                           Max.   :12.000  
##     SEASON         
##  Length:2446143    
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Linear model on sampled data looks the same largely

# Perform a linear regression with UNIT_SALES as the dependent variable
# and PRICE (or your chosen variable) as the independent variable
linear_model <- lm(DOLLAR_SALES ~ UNIT_SALES, data = df)

# Print the summary of the linear model to see the results
summary(linear_model)
## 
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -140089    -117     -68      -3  225329 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 69.056096   1.023439   67.47   <2e-16 ***
## UNIT_SALES   2.989060   0.001201 2489.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1567 on 2446141 degrees of freedom
## Multiple R-squared:  0.717,  Adjusted R-squared:  0.717 
## F-statistic: 6.196e+06 on 1 and 2446141 DF,  p-value: < 2.2e-16
# Create a scatter plot with the regression line, colored by MANUFACTURER
ggplot(df, aes(x = UNIT_SALES, y = DOLLAR_SALES, color = MANUFACTURER)) +
  geom_point(alpha = 0.5) +  # Adjust alpha to avoid overplotting, if necessary
  geom_smooth(method = "lm", color = "black", se = FALSE) +  # Add linear regression line without confidence band for clarity
  labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES by MANUFACTURER",
       x = "UNIT SALES",
       y = "DOLLAR SALES") +
  theme_minimal() +
  theme(legend.position = "bottom")  # Adjust legend position if needed
## `geom_smooth()` using formula = 'y ~ x'

Taking a look at Diet Smash brand..

# create a table of total values by brand
brand_summary <- df %>%
  group_by(BRAND) %>%
  summarise(
    total_units_sold = sum(UNIT_SALES),
    total_revenue = sum(DOLLAR_SALES),
    avg_price = total_revenue / total_units_sold,
    total_days_sold = n() # Count the number of rows for each brand
  ) %>%
  arrange(desc(total_revenue)) %>%  # Order by revenue in descending order
  mutate(rank = row_number()) 

summary(brand_summary)
##     BRAND           total_units_sold   total_revenue         avg_price      
##  Length:288         Min.   :       1   Min.   :        1   Min.   : 0.5315  
##  Class :character   1st Qu.:    2310   1st Qu.:     7563   1st Qu.: 2.0861  
##  Mode  :character   Median :   94691   Median :   266075   Median : 3.0291  
##                     Mean   : 1473003   Mean   :  4989427   Mean   : 3.2661  
##                     3rd Qu.:  651385   3rd Qu.:  2161764   3rd Qu.: 3.7252  
##                     Max.   :40414038   Max.   :159387186   Max.   :42.9378  
##  total_days_sold         rank       
##  Min.   :     1.0   Min.   :  1.00  
##  1st Qu.:   121.8   1st Qu.: 72.75  
##  Median :  1988.0   Median :144.50  
##  Mean   :  8493.5   Mean   :144.50  
##  3rd Qu.:  8075.8   3rd Qu.:216.25  
##  Max.   :124603.0   Max.   :288.00
print(brand_summary[brand_summary$BRAND == "DIET SMASH", ])
## # A tibble: 1 × 6
##   BRAND      total_units_sold total_revenue avg_price total_days_sold  rank
##   <chr>                 <dbl>         <dbl>     <dbl>           <int> <int>
## 1 DIET SMASH           50496.       210377.      4.17            1763   150

Out of 288 brands, DIET SMASH slides in at 150th place in terms of total revenue at an above average price $4.17 vs overall $3.27.

# Filter the dataframe for only 'DIET SMASH'
filtered_df <- df %>% 
  filter(BRAND == "DIET SMASH")

# Create the plot
ggplot(filtered_df, aes(x = UNIT_SALES, y = DOLLAR_SALES)) +
  geom_point(color = "red", alpha = 1) +  # Bright red points with full opacity
  geom_smooth(method = "lm", color = "black", se = FALSE) +  # Add linear regression line without confidence band
  labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES for DIET SMASH",
       x = "UNIT SALES",
       y = "DOLLAR SALES") +
  theme_minimal() +
  theme(legend.position = "none")  
## `geom_smooth()` using formula = 'y ~ x'

DIET SMASH is not a big seller at base line (our sample only contains abourt 1800 observations). There are 2 distinct trend lines, a high flier group and a low flier group. The high flier group follows the trend line better, while staying mostly above it. The low flier group underperforms the trend line signficantly. The high flier group begins to take an outlier distribution abover 100 unit sales, where dollar sales begin to rapidly outpace the trend line. The low flier group has a much less steep slope, and remains farily tight to about 125 unit sales, but as soon as things hit 150 the dollar sales begin to rise almost verically to meet their peers in the high flier group. Once DIET SMASH hits 150 unit sales or so the dollars start to roll in.

Sales by Week of the Year

filtered_df %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
  group_by(WEEK) %>%
  summarise(total_sales = sum(UNIT_SALES)) %>%
  ggplot(aes(x = WEEK, y = total_sales)) +
  geom_line(color = "black") +  # Blue line connecting points
  labs(title = "Total Sales by Week of the Year",
       x = "Week of the Year",
       y = "Total Unit Sales") +
  theme_minimal()

There are 4 or 5 signficant peaks in sales by week of Diet Smash with a fairly strong clustering between 20 - 35 weeks.

library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Calculate total sales for each group of 13 consecutive weeks
sales_by_group <- filtered_df %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
  group_by(WEEK) %>%
  summarise(total_sales = sum(UNIT_SALES)) %>%
  mutate(sales_in_group = rollsum(total_sales, 13, align = "left", fill = NA)) %>%
  mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 13)) %>%
  arrange(WEEK) %>%  # Order by WEEK
  filter(!is.na(sales_in_group))  # Remove rows with sales_in_group = NA

# Plot the bar chart
sales_by_group$week_label <- factor(sales_by_group$week_label, levels = sales_by_group$week_label[order(sales_by_group$WEEK)])
ggplot(sales_by_group, aes(x = factor(week_label), y = sales_in_group)) +
  geom_bar(stat = "identity", fill = "black") +
  labs(title = "Total Sales for Each 13-Week Grouping",
       x = "Weeks (Starting from Week 1)",
       y = "Total Sales") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

> DIET SMASH has a strong prounced ride up around week 8 and 35, with peak 13 week between week 22 to 34.

#find the best 13 weeks for Kiwano sales
# Calculate total sales for each group of 13 consecutive weeks
sales_by_plum <- df %>%
  filter(str_detect(ITEM, "PLUM")) %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
  group_by(WEEK) %>%
  summarise(total_sales = sum(UNIT_SALES)) %>%
  mutate(sales_in_group = rollsum(total_sales, 13, align = "left", fill = NA)) %>%
  mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 13)) %>%
  arrange(WEEK) %>%  # Order by WEEK
  filter(!is.na(sales_in_group))  # Remove rows with sales_in_group = NA

# Plot the bar chart
sales_by_plum$week_label <- factor(sales_by_plum$week_label, levels = sales_by_plum$week_label[order(sales_by_plum$WEEK)])
ggplot(sales_by_plum, aes(x = factor(week_label), y = sales_in_group)) +
  geom_bar(stat = "identity", fill = "black") +
  labs(title = "Total Sales for Each 13-Week Grouping",
       x = "Weeks (Starting from Week 1)",
       y = "Total Sales") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

Plum generally trends upward with the strongest sales after week 30, with week 40 to week 52 being the peak 13 week period.

#find the best 13 weeks for plum, ssd, diet, small or one package sales
# Calculate total sales for each group of 13 consecutive weeks
#PLUM flavor does not come in DIET so we will assume that the DIET sales are the same as the REGULAR sales
sales_by_innovation <- df %>%
  filter(CATEGORY == "SSD",
         str_detect(ITEM, "PLUM"),
         str_detect(PACKAGE, "12"),
         str_detect(PACKAGE, 'ONE')) %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
  group_by(WEEK) %>%
  summarise(total_sales = sum(UNIT_SALES)) %>%
  mutate(sales_in_group = rollsum(total_sales, 13, align = "left", fill = NA)) %>%
  mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 13)) %>%
  arrange(WEEK) %>%  # Order by WEEK
  filter(!is.na(sales_in_group))  # Remove rows with sales_in_group = NA

# Plot the bar chart
sales_by_innovation$week_label <- factor(sales_by_innovation$week_label, levels = sales_by_innovation$week_label[order(sales_by_innovation$WEEK)])
ggplot(sales_by_innovation, aes(x = factor(week_label), y = sales_in_group)) +
  geom_bar(stat = "identity", fill = "black") +
  labs(title = "Total Sales for Each 13-Week Grouping",
       x = "Weeks (Starting from Week 1)",
       y = "Total Sales") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

With Package, Plum, and SSD we sales increaing slighltyly through the year with week 32 though 45 being the best.

Make a new smaller “innovation” data frame

#create innovation based on SSD, Plum, and packages that come in 12 (nearest number to 11) and ONE
innovation<- df %>%
  filter(CATEGORY == "SSD",
         str_detect(ITEM, "PLUM"),
         str_detect(PACKAGE, '12'),
         str_detect(PACKAGE, 'ONE'))

 


#unique PACKAGE string from innovation
print(unique(innovation$PACKAGE))
## [1] "12SMALL 12ONE CUP"          "12SMALL 18ONE CUP"         
## [3] "12SMALL 24ONE CUP"          "12SMALL 6ONE CUP"          
## [5] "12SMALL 20ONE CUP"          "12SMALL 24ONE PLASTICS JUG"
library(dplyr)
library(lubridate)

innovation <- innovation %>%
  mutate(
    MONTH = month(ymd(DATE)),  # Extract month using lubridate's ymd function
    MONTH = as.factor(MONTH)   # Convert the extracted month into a factor
  )

str(innovation)
## 'data.frame':    5157 obs. of  13 variables:
##  $ MARKET_KEY     : chr  "806" "915" "331" "953" ...
##  $ DATE           : chr  "2022-07-23" "2021-07-10" "2023-10-28" "2023-06-10" ...
##  $ CALORIC_SEGMENT: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ CATEGORY       : chr  "SSD" "SSD" "SSD" "SSD" ...
##  $ UNIT_SALES     : num  7 3 19 4 77 78 69 2 12 4 ...
##  $ DOLLAR_SALES   : num  38.43 9.87 165.81 43.92 469.72 ...
##  $ MANUFACTURER   : chr  "JOLLYS" "BEARS" "JOLLYS" "JOLLYS" ...
##  $ BRAND          : chr  "HILL MOISTURE THRASHED APPLE" "SINGLE GROUP" "BEAUTIFUL GREENER" "BEAUTIFUL GREENER" ...
##  $ PACKAGE        : chr  "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 18ONE CUP" "12SMALL 24ONE CUP" ...
##  $ ITEM           : chr  "RAINING GENTLE DRINK THRASHED PLUM  CUP 12 LIQUID SMALL X12" "ZIZZLES GENTLE DRINK PLUM  CUP 12 LIQUID SMALL X12" "BEAUTIFUL GREENER GENTLE DRINK PLUM  CUP 12 LIQUID SMALL X18" "BEAUTIFUL GREENER GENTLE DRINK PLUM  CUP 12 LIQUID SMALL X24" ...
##  $ REGION         : chr  "SOCAL" "DESERT_SW" "SOCAL" "ARIZONA" ...
##  $ MONTH          : Factor w/ 12 levels "1","2","3","4",..: 7 7 10 6 5 3 9 7 5 2 ...
##  $ SEASON         : chr  "SUMMER" "SUMMER" "FALL" "SUMMER" ...
print(unique(innovation$ITEM))
## [1] "RAINING GENTLE DRINK THRASHED PLUM  CUP 12 LIQUID SMALL X12" 
## [2] "ZIZZLES GENTLE DRINK PLUM  CUP 12 LIQUID SMALL X12"          
## [3] "BEAUTIFUL GREENER GENTLE DRINK PLUM  CUP 12 LIQUID SMALL X18"
## [4] "BEAUTIFUL GREENER GENTLE DRINK PLUM  CUP 12 LIQUID SMALL X24"
## [5] "BEAUTIFUL GREENER GENTLE DRINK PLUM  CUP 12 LIQUID SMALL X12"
## [6] "RAINING GENTLE DRINK THRASHED PLUM  CUP 12 LIQUID SMALL"     
## [7] "BEAUTIFUL GREENER GENTLE DRINK PLUM  CUP 12 LIQUID SMALL X20"
## [8] "BEAUTIFUL GREENER GENTLE DRINK PLUM  CUP 12 LIQUID SMALL"    
## [9] "BEAUTIFUL GREENER GENTLE DRINK PLUM  JUG 12 LIQUID SMALL X24"
# Count the number of unique PACKAGE column of our sample
table(innovation$PACKAGE)
## 
##          12SMALL 12ONE CUP          12SMALL 18ONE CUP 
##                       4470                        158 
##          12SMALL 20ONE CUP          12SMALL 24ONE CUP 
##                         47                        419 
## 12SMALL 24ONE PLASTICS JUG           12SMALL 6ONE CUP 
##                         30                         33
# Creating an 'innovation' data frame
model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + SEASON + REGION, data = innovation)
summary(model)
## 
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + 
##     SEASON + REGION, data = innovation)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1713.7   -20.3     7.2    36.1  4054.2 
## 
## Coefficients: (1 not defined because of singularities)
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -43.94160    6.80689  -6.455 1.18e-10 ***
## UNIT_SALES                          5.21325    0.02878 181.161  < 2e-16 ***
## CALORIC_SEGMENT                          NA         NA      NA       NA    
## PACKAGE12SMALL 18ONE CUP           82.68936   14.85976   5.565 2.76e-08 ***
## PACKAGE12SMALL 20ONE CUP           45.36041   25.20527   1.800 0.071976 .  
## PACKAGE12SMALL 24ONE CUP          278.66657    9.04304  30.816  < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG  78.90410   31.40814   2.512 0.012028 *  
## PACKAGE12SMALL 6ONE CUP           -27.55356   29.95315  -0.920 0.357674    
## SEASONSPRING                       17.21712    6.77551   2.541 0.011080 *  
## SEASONSUMMER                       23.75171    6.73407   3.527 0.000424 ***
## SEASONWINTER                        9.11365    6.95267   1.311 0.189979    
## REGIONCALI_NEVADA                  24.74310   14.96553   1.653 0.098323 .  
## REGIONCOLORADO                     40.01146    8.34877   4.793 1.69e-06 ***
## REGIONDESERT_SW                    33.93184   10.13193   3.349 0.000817 ***
## REGIONKANSAS                      182.94177   17.53758  10.431  < 2e-16 ***
## REGIONMOUNTAIN                     20.75593    9.47696   2.190 0.028558 *  
## REGIONNEWMEXICO                    27.12353   13.03611   2.081 0.037516 *  
## REGIONNOCAL                       -10.41959   10.94085  -0.952 0.340961    
## REGIONNORTHERN                     39.43195    7.81663   5.045 4.70e-07 ***
## REGIONPRAIRIE                      33.61715   16.23417   2.071 0.038431 *  
## REGIONSOCAL                       -25.08608    8.17700  -3.068 0.002167 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 170.3 on 5137 degrees of freedom
## Multiple R-squared:  0.8777, Adjusted R-squared:  0.8773 
## F-statistic:  1941 on 19 and 5137 DF,  p-value: < 2.2e-16

Good gravy 0.87 R-squared. This model is a beast. PACKAGE12SMALL 24ONE CUP and KANSAS are the most significant variables. Seasonally Spring and Summer are the most signficant and best times to sell.

#More exploration

library(dplyr)

small_group <- df %>%
  filter(UNIT_SALES < 600, DOLLAR_SALES < 3600)

skim(small_group)
Data summary
Name small_group
Number of rows 2303607
Number of columns 13
_______________________
Column type frequency:
character 9
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MARKET_KEY 0 1 1 4 0 200 0
DATE 0 1 10 10 0 152 0
CATEGORY 0 1 3 18 0 5 0
MANUFACTURER 0 1 5 8 0 8 0
BRAND 0 1 4 56 0 288 0
PACKAGE 0 1 11 26 0 95 0
ITEM 0 1 26 142 0 2999 0
REGION 0 1 5 11 0 11 0
SEASON 0 1 4 6 0 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.49 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇
UNIT_SALES 0 1 80.31 110.56 0.04 10.00 35.00 102.00 599.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 284.84 432.53 0.01 33.12 118.54 342.48 3599.94 ▇▁▁▁▁
MONTH 0 1 6.28 3.44 1.00 3.00 6.00 9.00 12.00 ▇▆▆▅▇

Our sample looks farily representative as the mean and sd are quite close to the full df for DIET SMASH.

# Create a scatter plot with the regression line, colored by MANUFACTURER
ggplot(small_group, aes(x = UNIT_SALES, y = DOLLAR_SALES, color = MANUFACTURER)) +
  geom_point(alpha = 0.5) +  # Adjust alpha to avoid overplotting, if necessary
  geom_smooth(method = "lm", color = "black", se = FALSE) +  # Add linear regression line without confidence band for clarity
  labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES by MANUFACTURER",
       x = "UNTI SALES",
       y = "DOLLAR SALES") +
  theme_minimal() +
  theme(legend.position = "bottom")  # Adjust legend position if needed
## `geom_smooth()` using formula = 'y ~ x'

Behold the realm of DIET SMASH. Certain items sell much better, or wosrse with consideration of slop of dollars to units sold. The overall trend line in this realm is below that of DIET SMASH, as DIET SMASH is almost $1000 at 200 units sold while the realm is 375 units sold to get to $1000.

#Make the small plum df > Investigating drinks with Plum as a flavor in the Item description.

# Create a new data frame with only the rows where the ITEM column contains the word 'plum'
plum_small <- df[grep("plum", df$ITEM, ignore.case = TRUE), ]
skim(plum_small)
Data summary
Name plum_small
Number of rows 28981
Number of columns 13
_______________________
Column type frequency:
character 9
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MARKET_KEY 0 1 1 4 0 200 0
DATE 0 1 10 10 0 152 0
CATEGORY 0 1 3 18 0 4 0
MANUFACTURER 0 1 5 8 0 6 0
BRAND 0 1 5 30 0 21 0
PACKAGE 0 1 12 26 0 20 0
ITEM 0 1 48 126 0 52 0
REGION 0 1 5 11 0 11 0
SEASON 0 1 4 6 0 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.60 0.49 0.0 0.00 1.00 1.00 1.00 ▆▁▁▁▇
UNIT_SALES 0 1 57.73 182.93 1.0 7.00 23.00 62.00 19588.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 168.67 545.14 0.5 17.64 61.28 173.03 56603.31 ▇▁▁▁▁
MONTH 0 1 6.34 3.51 1.0 3.00 6.00 10.00 12.00 ▇▆▅▃▇

Plum as a flavor has lower unit sales mean of 58 and dollar sales mean of $169 as compared to overall Diet Smash at 80 mean unit sales and mean of $235 dolalr sales.

# plum small is dataframe
model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + CATEGORY + SEASON + REGION, data = plum_small)
summary(model)
## 
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + 
##     CATEGORY + SEASON + REGION, data = plum_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3296.9   -35.7     5.2    42.1  6282.8 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -1.662e+02  1.101e+01 -15.092  < 2e-16 ***
## UNIT_SALES                         2.855e+00  4.672e-03 611.008  < 2e-16 ***
## CALORIC_SEGMENT                    2.254e+01  3.289e+00   6.854 7.35e-12 ***
## PACKAGE12SMALL 12ONE CUP           1.104e+02  8.626e+00  12.799  < 2e-16 ***
## PACKAGE12SMALL 18ONE CUP           5.029e+01  1.389e+01   3.621 0.000294 ***
## PACKAGE12SMALL 20ONE CUP           2.126e+01  2.202e+01   0.965 0.334501    
## PACKAGE12SMALL 24ONE CUP           3.102e+02  1.031e+01  30.093  < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG  3.357e+01  2.683e+01   1.251 0.210785    
## PACKAGE12SMALL 6ONE CUP           -2.337e+01  2.577e+01  -0.907 0.364466    
## PACKAGE12SMALL 8ONE CUP            1.411e+02  1.017e+01  13.881  < 2e-16 ***
## PACKAGE12SMALL MLT PLASTICS JUG   -6.406e+01  9.688e+00  -6.612 3.86e-11 ***
## PACKAGE12SMALL MULTI CUP           1.308e+02  1.091e+01  11.985  < 2e-16 ***
## PACKAGE16SMALL 12ONE CUP           1.705e+02  2.270e+01   7.509 6.14e-14 ***
## PACKAGE16SMALL 4ONE CUP            1.716e+02  1.564e+01  10.973  < 2e-16 ***
## PACKAGE16SMALL MULTI CUP           8.494e+01  1.044e+01   8.133 4.35e-16 ***
## PACKAGE20SMALL MULTI JUG          -6.308e+01  8.786e+00  -7.179 7.17e-13 ***
## PACKAGE24SMALL MLT SHADYES JUG    -3.669e+01  2.074e+01  -1.769 0.076839 .  
## PACKAGE24SMALL MULTI CUP           1.658e+02  1.961e+01   8.459  < 2e-16 ***
## PACKAGE2L MULTI JUG               -9.688e+01  8.963e+00 -10.809  < 2e-16 ***
## PACKAGE8SMALL 4ONE CUP             2.530e+02  1.215e+01  20.825  < 2e-16 ***
## PACKAGE8SMALL MULTI CUP            9.426e+01  1.165e+01   8.092 6.09e-16 ***
## PACKAGEALL OTHER ONES              9.939e+01  1.070e+01   9.290  < 2e-16 ***
## CATEGORYING ENHANCED WATER         2.111e+02  1.022e+01  20.652  < 2e-16 ***
## CATEGORYSPARKLING WATER            6.433e+01  4.377e+00  14.698  < 2e-16 ***
## CATEGORYSSD                        1.509e+02  6.341e+00  23.803  < 2e-16 ***
## SEASONSPRING                       1.835e+00  2.363e+00   0.777 0.437353    
## SEASONSUMMER                       4.347e+00  2.447e+00   1.776 0.075661 .  
## SEASONWINTER                      -4.054e+00  2.367e+00  -1.713 0.086711 .  
## REGIONCALI_NEVADA                  6.767e+00  4.883e+00   1.386 0.165813    
## REGIONCOLORADO                     2.440e+01  2.874e+00   8.492  < 2e-16 ***
## REGIONDESERT_SW                    2.447e+00  3.705e+00   0.661 0.508907    
## REGIONKANSAS                       3.333e+01  5.680e+00   5.868 4.47e-09 ***
## REGIONMOUNTAIN                     7.940e+00  3.202e+00   2.479 0.013171 *  
## REGIONNEWMEXICO                    1.678e+01  4.336e+00   3.870 0.000109 ***
## REGIONNOCAL                        2.055e+00  4.092e+00   0.502 0.615463    
## REGIONNORTHERN                    -6.459e+00  2.562e+00  -2.522 0.011688 *  
## REGIONPRAIRIE                      8.138e+00  5.232e+00   1.555 0.119841    
## REGIONSOCAL                        1.362e+00  3.126e+00   0.436 0.662929    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 139.8 on 28943 degrees of freedom
## Multiple R-squared:  0.9343, Adjusted R-squared:  0.9343 
## F-statistic: 1.113e+04 on 37 and 28943 DF,  p-value: < 2.2e-16

Adjusted R squared of 0.934, just amazing. Plum does well in Colorado, Kansas, and New Mexico (the last two being similar to Diet Smash). The two existing packages that Diet Smash comes in also are significant, for Plum (2L Multi JUG fairly strong negative like Diet Smash and 12small 12one CUP just slightly positive). The 4 week package run, PACKAGE12SMALL 6ONE CUP, from Diet Smash is not statisically signficant for Plum either, which could be cause of concern for what sounds to be a similar package of 11small 4one “JUG|CUP”. Although “All Other Ones”, could be a good proxy for innovation packaging and is strongly signficant. The only thing that is not significant across the board is the season for plum, although Summer and Winter could be edge cases thrown off by some noise.

Cleaning

Reworking the subset plum for more feature engineering.

plum_small <- plum_small %>%
  mutate(
    PACKAGE2 = str_extract(ITEM, "(CUP|JUG).*"),  # Extracts the part from CUP or JUG to the end.
    ITEM = str_replace(ITEM, "(CUP|JUG).*", "")  # Replaces the CUP/JUG and everything after it with empty string in ITEM.
  )


#plum_small
plum_small <- plum_small %>%
  mutate(
    TEMP = str_extract(ITEM, "\\d+\\.?\\d*.*"), # Extracts the part from the first number to the end.
    PACKAGE2 = if_else(is.na(PACKAGE2), TEMP, paste(PACKAGE2, TEMP)), # Combines existing PACKAGE2 with new extraction if needed.
    ITEM = str_replace(ITEM, "\\d+\\.?\\d*.*", ""), # Removes the numeric part and everything after it from ITEM.
    TEMP = NULL  # Removes the temporary column.
  )

#plum_small
na_rows <- plum_small %>%
  filter(is.na(PACKAGE2))
#na_rows
#the above steps excised all packaging out of ITEM column
plum_small <- plum_small %>%
  mutate(
    GENTLE_DRINK = if_else(str_detect(ITEM, "GENTLE DRINK"), 1, 0), # Assigns 1 if "GENTLE DRINK" exists, otherwise 0.
    ITEM = str_replace(ITEM, "GENTLE DRINK", "") # Removes "GENTLE DRINK" from ITEM.
  )
#plum_small
plum_small <- plum_small %>%
  mutate(
    ENERGY_DRINK = if_else(str_detect(ITEM, "ENERGY DRINK"), 1, 0), # Assigns 1 if "ENERGY DRINK" exists, otherwise 0.
    ITEM = str_replace(ITEM, "ENERGY DRINK", "") # Removes "ENERGY DRINK" from ITEM.
  )

#plum_small
library(dplyr)
library(stringr)

# Define the pattern as a regular expression
pattern <- "ZERO CALORIES|ZERO CALORIE|ZERO SUGAR|SUGAR FREE|NO CALORIES"

plum_small <- plum_small %>%
  mutate(
    CALORIC_SEGMENT_TEXT = str_extract(ITEM, pattern), # Extracts matching text based on the pattern.
    ITEM = str_replace_all(ITEM, pattern, "") # Removes extracted text from ITEM.
  )

#plum_small
library(dplyr)
library(stringr)

plum_small <- plum_small %>%
  mutate(
    CALORIC_SEGMENT_TEXT = if_else(str_detect(ITEM, "\\bDIET\\b"), 
                                   if_else(is.na(CALORIC_SEGMENT_TEXT), "DIET", paste(CALORIC_SEGMENT_TEXT, "DIET", sep=", ")), 
                                   CALORIC_SEGMENT_TEXT)
  )
#plum_small
# Function to remove the second instance of any repeating word
remove_second_instance <- function(item) {
  words <- unlist(str_split(item, "\\s+")) # Split item into words
  unique_words <- unique(words) # Get unique words to check for repeats
  for (word in unique_words) {
    word_indices <- which(words == word) # Find all indices of the current word
    if (length(word_indices) > 1) { # If there is more than one occurrence
      words[word_indices[2]] <- "" # Remove the second occurrence
    }
  }
  return(paste(words, collapse = " ")) # Reconstruct sentence without the second instance
}

# Apply the function to the 'ITEM' column
plum_small <- plum_small %>%
  mutate(ITEM = sapply(ITEM, remove_second_instance))


# Remove specific columns
plum_small <- select(plum_small, -PACKAGE2, -GENTLE_DRINK, -ENERGY_DRINK, -CALORIC_SEGMENT_TEXT)
head(plum_small)
##          MARKET_KEY       DATE CALORIC_SEGMENT           CATEGORY UNIT_SALES
## 9204434         424 2021-06-12               1             ENERGY        115
## 17627306        806 2022-02-26               1             ENERGY         36
## 18144465        831 2023-03-25               1                SSD         93
## 647833           32 2022-01-15               0    SPARKLING WATER         12
## 1090185          56 2021-02-06               0 ING ENHANCED WATER         45
## 17667230        806 2022-07-23               1                SSD          7
##          DOLLAR_SALES MANUFACTURER                          BRAND
## 9204434        186.10       JOLLYS            SUPER-DUPER PUNCHED
## 17627306        74.77       JOLLYS            SUPER-DUPER PUNCHED
## 18144465       204.19       JOLLYS              BEAUTIFUL GREENER
## 647833          43.29        BEARS                          CROWN
## 1090185         53.53       JOLLYS SOOOO-COOOOL FUTURE WATER ZERO
## 17667230        38.43       JOLLYS   HILL MOISTURE THRASHED APPLE
##                    PACKAGE
## 9204434  16SMALL MULTI CUP
## 17627306 16SMALL MULTI CUP
## 18144465 20SMALL MULTI JUG
## 647833    12SMALL 8ONE CUP
## 1090185  20SMALL MULTI JUG
## 17667230 12SMALL 12ONE CUP
##                                                                      ITEM
## 9204434                              SUPER-DUPER PITAYA ED HARDONLY PLUM 
## 17627306                             SUPER-DUPER PITAYA ED HARDONLY PLUM 
## 18144465                                          BEAUTIFUL GREENER PLUM 
## 647833                                  CROWN SPARKLING WATER BEACH PLUM 
## 1090185  SOOOO-COOOOL FUTURE WATER  BEVERAGE FUJI PLUM KEEN ZERO CAL PER 
## 17667230                                           RAINING THRASHED PLUM 
##            REGION MONTH SEASON
## 9204434  NORTHERN     6 SUMMER
## 17627306    SOCAL     2 WINTER
## 18144465  ARIZONA     3 SPRING
## 647833   NORTHERN     1 WINTER
## 1090185  NORTHERN     2 WINTER
## 17667230    SOCAL     7 SUMMER

FINAL THOUGHTS

We now know that there are 2 innovation aspects at play here, a new package for Diet Smash in the complete sense of 11small and 4one and an existing flavor,net new added to Diet Smash. Both our innovation plum form and the small Plum Multiple Linear regressions are incredibly high indicating that there is high potential some reasonable forecasting. Diet Smash has some significance in terms of season in Summer and Winter, but Plum by itself is right on the edge of not being significant for Summer and Winter. Our innovation data frame showed promised spring and summer. Packaging is not as strong by itself as Plum and Diet Smash ony comes in 2 regular types with 1 size that ran for 4 weeks. 12small 6one is likely pretty close for 11small 4one, but regressions in the innovation data frame showed PACKAGE12SMALL 24ONE CUP as the winner . It’s possible that together, based on Diet Smash, that the best 13 weeks are Spring and Summer for Plum, with specific week details to be determined.

df <- read_csv("swire_no_nas_w_pop.csv")  #inject the data and we will sub-sample
## Rows: 24461424 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): CALORIC_SEGMENT, CATEGORY, MANUFACTURER, BRAND, PACKAGE, ITEM
## dbl  (4): MARKET_KEY, UNIT_SALES, DOLLAR_SALES, POP_SQMI
## date (1): DATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Remove things at the very beginning and drop the weird missing period near the end

#Group by ITEM with DATE Before 2021-01-01, drop those ITEM rows
# df_long_running <- df %>%
#   group_by(ITEM) %>%
#   filter(DATE <= "2021-01-01")
# 
# #remove all rows in plum_long_running from plum
# df <- df %>%
#   anti_join(df_long_running)

#Group by ITEM rows with less than 12 weeks of data
df_small <- df %>%
  group_by(ITEM) %>%
  filter(n() <= 12)

#remove all rows in df small that run less than 13 weeks 
df <- df %>%
  anti_join(df_small)
## Joining with `by = join_by(MARKET_KEY, DATE, CALORIC_SEGMENT, CATEGORY,
## UNIT_SALES, DOLLAR_SALES, MANUFACTURER, BRAND, PACKAGE, ITEM, POP_SQMI)`
#Drop rows after May 21st 2023 as there are several gaps for most brands in innovation plum
df <- df %>%
  filter(DATE <= "2023-05-21")

#cleanup everything but df
rm(df_small)

#skim(df)
#start with PLUM, SSD, and Package features
plum_package <- df %>%
  filter(CATEGORY == "SSD",
         str_detect(ITEM, "PLUM"))

#try plum ssd diet
plum_diet <- df %>%
  filter (
         str_detect(ITEM, "PLUM"),
         CALORIC_SEGMENT == "DIET/LIGHT")

#toss in some diet smash for good measure ans some package features
diet_smash <- df %>%
  filter (
         BRAND == "DIET SMASH",
         CATEGORY == "SSD")
         

#combine the three
merged_plum_innovation <- bind_rows(plum_package, plum_diet, diet_smash)

#remove duplicate rows
df <- merged_plum_innovation %>% distinct()

skim(df)
Data summary
Name df
Number of rows 215151
Number of columns 11
_______________________
Column type frequency:
character 6
Date 1
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
CALORIC_SEGMENT 0 1 7 10 0 2 0
CATEGORY 0 1 3 18 0 4 0
MANUFACTURER 0 1 5 8 0 5 0
BRAND 0 1 5 30 0 15 0
PACKAGE 0 1 12 26 0 18 0
ITEM 0 1 48 126 0 39 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
DATE 0 1 2020-12-05 2023-05-20 2022-04-09 129

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
MARKET_KEY 0 1 630.96 613.44 1.00 303.00 613.00 882.00 6802.00 ▇▁▁▁▁
UNIT_SALES 0 1 46.29 76.20 1.00 7.00 22.00 55.00 3157.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 138.90 260.51 0.03 18.00 60.39 156.86 12763.23 ▇▁▁▁▁
POP_SQMI 0 1 1548.17 1855.52 0.18 47.98 709.34 2774.91 6769.35 ▇▂▂▁▁
#cleanup all objects other than df
rm(plum_package, plum_diet, diet_smash, merged_plum_innovation)
regions_joinme <- read.csv("states_summary.csv")

unique(regions_joinme$REGION)
##  [1] "NORTHERN"    "DESERT_SW"   "PRAIRIE"     "CALI_NEVADA" "MOUNTAIN"   
##  [6] "SOCAL"       "ARIZONA"     "NEWMEXICO"   "NOCAL"       "COLORADO"   
## [11] "KANSAS"
# "NORTHERN"    "DESERT_SW"   "PRAIRIE"     "CALI_NEVADA"  "MOUNTAIN"    "SOCAL"   "ARIZONA"    "NEWMEXICO"   "NOCAL"    "COLORADO"    "KANSAS" 

str(regions_joinme)
## 'data.frame':    200 obs. of  2 variables:
##  $ MARKET_KEY: int  13 70 179 197 272 352 32 33 44 50 ...
##  $ REGION    : chr  "NORTHERN" "NORTHERN" "DESERT_SW" "DESERT_SW" ...
# Perform a left join using the merge() function
df <- merge(df, regions_joinme[, c("MARKET_KEY", "REGION")], by = "MARKET_KEY", all.x = TRUE)
rm(regions_joinme)
# Update CALORIC_SEGMENT values: 0 if 'DIET/LIGHT', otherwise 1
df$CALORIC_SEGMENT <- ifelse(df$CALORIC_SEGMENT == "DIET/LIGHT", 0, 1)
df$MARKET_KEY <- as.character(df$MARKET_KEY)
df <- df %>%
  mutate(
    MONTH = as.numeric(substr(DATE, 6, 7)),  # Extract the month from YYYY-MM-DD format
    SEASON = case_when(
      MONTH %in% c(12, 01, 02) ~ "WINTER",
      MONTH %in% c(03, 04, 05) ~ "SPRING",
      MONTH %in% c(06, 07, 08) ~ "SUMMER",
      MONTH %in% c(09, 10, 11) ~ "FALL",
      TRUE ~ NA_character_  # This is just in case there are any undefined values
    )
  )

Cleaning Plum

#save merged_innovation_df back to plum
plum  <- df

skim(plum)
Data summary
Name plum
Number of rows 215151
Number of columns 14
_______________________
Column type frequency:
character 8
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MARKET_KEY 0 1 1 4 0 200 0
CATEGORY 0 1 3 18 0 4 0
MANUFACTURER 0 1 5 8 0 5 0
BRAND 0 1 5 30 0 15 0
PACKAGE 0 1 12 26 0 18 0
ITEM 0 1 48 126 0 39 0
REGION 0 1 5 11 0 11 0
SEASON 0 1 4 6 0 4 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
DATE 0 1 2020-12-05 2023-05-20 2022-04-09 129

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.44 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
UNIT_SALES 0 1 46.29 76.20 1.00 7.00 22.00 55.00 3157.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 138.90 260.51 0.03 18.00 60.39 156.86 12763.23 ▇▁▁▁▁
POP_SQMI 0 1 1548.17 1855.52 0.18 47.98 709.34 2774.91 6769.35 ▇▂▂▁▁
MONTH 0 1 6.20 3.57 1.00 3.00 6.00 9.00 12.00 ▇▆▅▃▇

Reworking the subset plum for more feature engineering.

plum <- plum %>%
  mutate(
    PACKAGE2 = str_extract(ITEM, "(CUP|JUG).*"),  # Extracts the part from CUP or JUG to the end.
    ITEM = str_replace(ITEM, "(CUP|JUG).*", "")  # Replaces the CUP/JUG and everything after it with empty string in ITEM.
  )
plum <- plum %>%
  mutate(
    TEMP = str_extract(ITEM, "\\d+\\.?\\d*.*"), # Extracts the part from the first number to the end.
    PACKAGE2 = if_else(is.na(PACKAGE2), TEMP, paste(PACKAGE2, TEMP)), # Combines existing PACKAGE2 with new extraction if needed.
    ITEM = str_replace(ITEM, "\\d+\\.?\\d*.*", ""), # Removes the numeric part and everything after it from ITEM.
    TEMP = NULL  # Removes the temporary column.
  )
na_rows <- plum %>%
  filter(is.na(PACKAGE2))
na_rows
##  [1] MARKET_KEY      DATE            CALORIC_SEGMENT CATEGORY       
##  [5] UNIT_SALES      DOLLAR_SALES    MANUFACTURER    BRAND          
##  [9] PACKAGE         ITEM            POP_SQMI        REGION         
## [13] MONTH           SEASON          PACKAGE2       
## <0 rows> (or 0-length row.names)
#the above steps excised all packaging out of ITEM column
# Function to remove the second instance of any repeating word
remove_second_instance <- function(item) {
  words <- unlist(str_split(item, "\\s+")) # Split item into words
  unique_words <- unique(words) # Get unique words to check for repeats
  for (word in unique_words) {
    word_indices <- which(words == word) # Find all indices of the current word
    if (length(word_indices) > 1) { # If there is more than one occurrence
      words[word_indices[2]] <- "" # Remove the second occurrence
    }
  }
  return(paste(words, collapse = " ")) # Reconstruct sentence without the second instance
}

# Apply the function to the 'ITEM' column
plum <- plum %>%
  mutate(ITEM = sapply(ITEM, remove_second_instance))
# #One hot encode either "ENERGY" or "ED" in ITEM as an ENERGY_DRINK
plum$ENERGY_DRINK <- ifelse(str_detect(plum$ITEM, "ENERGY|' ED'"), 1, 0)

plum$ITEM <- str_replace(plum$ITEM, "ENERGY DRINK", "")
plum$ITEM <- str_replace(plum$ITEM, "ENERGY", "")
plum$ITEM <- str_replace(plum$ITEM, " ED", "")
table(plum$ENERGY_DRINK)
## 
##      0      1 
## 176398  38753
table(plum$CATEGORY)
## 
##             ENERGY ING ENHANCED WATER    SPARKLING WATER                SSD 
##              38753               3760              63752             108886
plum %>%
  filter(ENERGY_DRINK == 1,
         CATEGORY=='SSD') %>%
  select(ITEM) %>%
  head(10)
## [1] ITEM
## <0 rows> (or 0-length row.names)
# Remove specific columns
#plum <- select(plum, -PACKAGE2, -CATEGORY)
head(plum)
##   MARKET_KEY       DATE CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
## 1          1 2022-06-18               1      SSD          1         4.62
## 2          1 2022-04-30               1      SSD         14        86.86
## 3          1 2021-12-11               1      SSD         18        89.73
## 4          1 2022-07-30               1      SSD         13        65.60
## 5          1 2021-11-27               1      SSD         19        72.93
## 6          1 2022-06-11               1      SSD          4        25.60
##   MANUFACTURER                        BRAND           PACKAGE
## 1       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 2       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 3       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 4       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 5       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 6       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
##                                  ITEM POP_SQMI   REGION MONTH SEASON
## 1 RAINING GENTLE DRINK THRASHED PLUM  1.201114 NORTHERN     6 SUMMER
## 2 RAINING GENTLE DRINK THRASHED PLUM  1.201114 NORTHERN     4 SPRING
## 3 RAINING GENTLE DRINK THRASHED PLUM  1.201114 NORTHERN    12 WINTER
## 4 RAINING GENTLE DRINK THRASHED PLUM  1.201114 NORTHERN     7 SUMMER
## 5 RAINING GENTLE DRINK THRASHED PLUM  1.201114 NORTHERN    11   FALL
## 6 RAINING GENTLE DRINK THRASHED PLUM  1.201114 NORTHERN     6 SUMMER
##                     PACKAGE2 ENERGY_DRINK
## 1 CUP 12 LIQUID SMALL X12 NA            0
## 2 CUP 12 LIQUID SMALL X12 NA            0
## 3 CUP 12 LIQUID SMALL X12 NA            0
## 4 CUP 12 LIQUID SMALL X12 NA            0
## 5 CUP 12 LIQUID SMALL X12 NA            0
## 6 CUP 12 LIQUID SMALL X12 NA            0
table(plum$ITEM)
## 
##                                                       BEAUTIFUL GREENER GENTLE DRINK PLUM  
##                                                                                      63882 
##                                             BUBBLE JOY SPARKLING WATER PLUM   NO CALORIES  
##                                                                                       2354 
##                                      CROWN CURATE SPARKLING WATER PLUM BERRY CALORIE FREE  
##                                                                                         31 
##                               CROWN CURATE SPARKLING WATER PLUM MIXED-TROPPY ZERO CALORIE  
##                                                                                       3134 
##                                             CROWN SPARKLING WATER BEACH PLUM ZERO CALORIE  
##                                                                                      25354 
##                                                           DIET SMASH GENTLE DRINK SUNSET   
##                                                                                      14704 
##                                          DIGRESS NOURISH ENHANCE WATER BEVERAGE PLUM KEEN  
##                                                                                       2324 
## EXCLAMATION REFRESHER DRINK FLAVORED SPARKLING WATER FUJI PLUM PLUS WHITE   NO SWEETENERS  
##                                                                                      21216 
##                        EXCLAMATION REFRESHER SPARKLING WATER PLUM PLUS GINGER NO CALORIES  
##                                                                                        959 
##                                                 FANTASMIC GENTLE DRINK CUSTARD APPLE PLUM  
##                                                                                         15 
##                                                               FANTASMIC GENTLE DRINK PLUM  
##                                                                                         52 
##                                       FIZZY SPARKLING JUICE BEVERAGE  PLUM NO ADDED SUGAR  
##                                                                                      10704 
##                                                                       GO-DAY GENTLE DRINK  
##                                                                                        499 
##                                                                                  POW-POW   
##                                                                                        424 
##                                                        RAINING GENTLE DRINK THRASHED PLUM  
##                                                                                      26462 
##                           SOOOO-COOOOL FUTURE WATER  BEVERAGE FUJI PLUM KEEN ZERO CAL PER  
##                                                                                       1436 
##                           SUPER-DUPER RUN-QUICK  SUPER SOUR CUSTARD APPLE PLUM SUGAR FREE  
##                                                                                      22970 
##                                     SUPER-DUPER RUN-QUICK  SUPER SOURS CUSTARD APPLE PLUM  
##                                                                                        851 
##                          SUPER-DUPER RUN-QUICK  SUPER SOURS CUSTARD APPLE PLUM SUGAR FREE  
##                                                                                       1488 
##                          SUPER-DUPER RUN-QUICK  SUPER SOURS CUSTARD APPLE PLUM ZERO SUGAR  
##                                                                                       9591 
##                                                                 VENOMOUS BLAST  PINK PLUM  
##                                                                                       3429 
##                                                                 ZIZZLES GENTLE DRINK PLUM  
##                                                                                       3272
#Trim trailing white space at end of ITEM
plum$ITEM <- str_trim(plum$ITEM, side = "right")
# #replace "ENERGY DRINK" with "" in ITEM
plum$ITEM <- str_replace(plum$ITEM, "GENTLE DRINK", "")
pattern <- "ZERO CALORIES|ZERO CALORIE|ZERO SUGAR|SUGAR FREE|NO CALORIES|ZERO CARB|PURE ZERO|DIET|NO SWEETENERS|ZERO CAL PER|CALORIE FREE"

plum <- plum %>%
  mutate(
    CALORIC_SEGMENT_TEXT = str_extract(ITEM, pattern), # Extracts matching text based on the pattern.
    ITEM = str_replace_all(ITEM, pattern, "") # Removes extracted text from ITEM.
  )
# Find the minimum launch date for each product
min_launch_dates <- plum %>%
  group_by(ITEM) %>%
  summarise(min_launch_date = min(DATE))

# Join the minimum launch dates back to the original data
plum <- plum %>%
  left_join(min_launch_dates, by = "ITEM")

# Calculate the number of weeks since the product launch
plum <- plum %>%
  mutate(WEEKS_SINCE_LAUNCH = as.numeric(difftime(DATE, min_launch_date, units = "weeks")))

# Selecting required columns and printing the first 10 rows
plum %>%
  filter(UNIT_SALES > 0) %>%
  select(DATE, ITEM, WEEKS_SINCE_LAUNCH) %>%
  head(10)
##          DATE                   ITEM WEEKS_SINCE_LAUNCH
## 1  2022-06-18 RAINING  THRASHED PLUM                 41
## 2  2022-04-30 RAINING  THRASHED PLUM                 34
## 3  2021-12-11 RAINING  THRASHED PLUM                 14
## 4  2022-07-30 RAINING  THRASHED PLUM                 47
## 5  2021-11-27 RAINING  THRASHED PLUM                 12
## 6  2022-06-11 RAINING  THRASHED PLUM                 40
## 7  2022-01-22 RAINING  THRASHED PLUM                 20
## 8  2021-11-06 RAINING  THRASHED PLUM                  9
## 9  2021-10-16 RAINING  THRASHED PLUM                  6
## 10 2021-12-04 RAINING  THRASHED PLUM                 13
#Set any negative WEEKS_SINCE_LAUNCH to 0
# plum <- plum %>%
#   mutate(WEEKS_SINCE_LAUNCH = ifelse(WEEKS_SINCE_LAUNCH < 0, 0, WEEKS_SINCE_LAUNCH))
#too many brands and packages, lets try removing ITEM or CATEGORIES of WATER, ENERGY, and JUG packages
plum <- plum %>% 
  filter(!grepl("WATER", ITEM),
         !CATEGORY=='SPARKLING WATER',
         !CATEGORY=='ING ENHANCED WATER',
         !CATEGORY=='ENERGY')

#Let's remove Brand Diet Smash
# plum <- plum %>% 
#   filter(!BRAND=='DIET SMASH')
print(unique(plum$ITEM))
## [1] "RAINING  THRASHED PLUM"        "BEAUTIFUL GREENER  PLUM"      
## [3] " SMASH  SUNSET"                "FANTASMIC  PLUM"              
## [5] "ZIZZLES  PLUM"                 "GO-DAY "                      
## [7] "FANTASMIC  CUSTARD APPLE PLUM"
print(unique(plum$BRAND))
## [1] "HILL MOISTURE THRASHED APPLE" "BEAUTIFUL GREENER"           
## [3] "DIET SMASH"                   "FANTASMIC"                   
## [5] "SINGLE GROUP"                 "GO-DAY"
print(unique(plum$CATEGORY))
## [1] "SSD"
print(unique(plum$PACKAGE))
##  [1] "12SMALL 12ONE CUP"          "20SMALL MULTI JUG"         
##  [3] "2L MULTI JUG"               "12SMALL 6ONE CUP"          
##  [5] "12SMALL 24ONE CUP"          "12SMALL MLT PLASTICS JUG"  
##  [7] "12SMALL 20ONE CUP"          ".5L 6ONE JUG"              
##  [9] "12SMALL 18ONE CUP"          "12SMALL 24ONE PLASTICS JUG"
## [11] "24SMALL MLT SHADYES JUG"
write_csv(plum, "plum_tableau.csv")

str(plum)
## 'data.frame':    108886 obs. of  19 variables:
##  $ MARKET_KEY          : chr  "1" "1" "1" "1" ...
##  $ DATE                : Date, format: "2022-06-18" "2022-04-30" ...
##  $ CALORIC_SEGMENT     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ CATEGORY            : chr  "SSD" "SSD" "SSD" "SSD" ...
##  $ UNIT_SALES          : num  1 14 18 13 19 4 29 35 75 25 ...
##  $ DOLLAR_SALES        : num  4.62 86.86 89.73 65.6 72.93 ...
##  $ MANUFACTURER        : chr  "JOLLYS" "JOLLYS" "JOLLYS" "JOLLYS" ...
##  $ BRAND               : chr  "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" ...
##  $ PACKAGE             : chr  "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" ...
##  $ ITEM                : chr  "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" ...
##  $ POP_SQMI            : num  1.2 1.2 1.2 1.2 1.2 ...
##  $ REGION              : chr  "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
##  $ MONTH               : num  6 4 12 7 11 6 1 11 10 12 ...
##  $ SEASON              : chr  "SUMMER" "SPRING" "WINTER" "SUMMER" ...
##  $ PACKAGE2            : chr  "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
##  $ ENERGY_DRINK        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CALORIC_SEGMENT_TEXT: chr  NA NA NA NA ...
##  $ min_launch_date     : Date, format: "2021-09-04" "2021-09-04" ...
##  $ WEEKS_SINCE_LAUNCH  : num  41 34 14 47 12 40 20 9 6 13 ...
#remove all objects other than plum
rm(list = setdiff(ls(), "plum"))

print(unique(plum$ITEM))
## [1] "RAINING  THRASHED PLUM"        "BEAUTIFUL GREENER  PLUM"      
## [3] " SMASH  SUNSET"                "FANTASMIC  PLUM"              
## [5] "ZIZZLES  PLUM"                 "GO-DAY "                      
## [7] "FANTASMIC  CUSTARD APPLE PLUM"
print(unique(plum$BRAND))
## [1] "HILL MOISTURE THRASHED APPLE" "BEAUTIFUL GREENER"           
## [3] "DIET SMASH"                   "FANTASMIC"                   
## [5] "SINGLE GROUP"                 "GO-DAY"
print(unique(plum$CATEGORY))
## [1] "SSD"
print(unique(plum$PACKAGE))
##  [1] "12SMALL 12ONE CUP"          "20SMALL MULTI JUG"         
##  [3] "2L MULTI JUG"               "12SMALL 6ONE CUP"          
##  [5] "12SMALL 24ONE CUP"          "12SMALL MLT PLASTICS JUG"  
##  [7] "12SMALL 20ONE CUP"          ".5L 6ONE JUG"              
##  [9] "12SMALL 18ONE CUP"          "12SMALL 24ONE PLASTICS JUG"
## [11] "24SMALL MLT SHADYES JUG"
skim(plum)
Data summary
Name plum
Number of rows 108886
Number of columns 19
_______________________
Column type frequency:
character 10
Date 2
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MARKET_KEY 0 1.00 1 4 0 197 0
CATEGORY 0 1.00 3 3 0 1 0
MANUFACTURER 0 1.00 5 8 0 4 0
BRAND 0 1.00 6 28 0 6 0
PACKAGE 0 1.00 12 26 0 11 0
ITEM 0 1.00 7 29 0 7 0
REGION 0 1.00 5 11 0 11 0
SEASON 0 1.00 4 6 0 4 0
PACKAGE2 0 1.00 22 34 0 11 0
CALORIC_SEGMENT_TEXT 94182 0.14 4 4 0 1 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
DATE 0 1 2020-12-05 2023-05-20 2022-04-09 129
min_launch_date 0 1 2020-12-05 2021-09-04 2020-12-05 2

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.86 0.34 0.00 1.00 1.00 1.00 1.00 ▁▁▁▁▇
UNIT_SALES 0 1 51.24 85.35 1.00 9.00 25.00 58.00 3157.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 171.84 327.25 0.31 27.12 74.65 184.04 12763.23 ▇▁▁▁▁
POP_SQMI 0 1 1723.52 1926.89 0.18 57.10 843.08 3191.93 6769.35 ▇▂▂▁▁
MONTH 0 1 6.21 3.61 1.00 3.00 6.00 10.00 12.00 ▇▆▃▃▇
ENERGY_DRINK 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
WEEKS_SINCE_LAUNCH 0 1 59.78 35.73 0.00 30.00 58.00 87.00 128.00 ▇▇▇▆▆
# Creating an 'innovation' data frame

#factor all character variables
plum$REGION <- as.factor(plum$REGION)
#plum$CATEGORY <- as.factor(plum$CATEGORY)
plum$BRAND <- as.factor(plum$BRAND)
plum$SEASON <- as.factor(plum$SEASON)
plum$PACKAGE2 <- as.factor(plum$PACKAGE2)
sapply(plum, function(x) sum(is.na(x)))
##           MARKET_KEY                 DATE      CALORIC_SEGMENT 
##                    0                    0                    0 
##             CATEGORY           UNIT_SALES         DOLLAR_SALES 
##                    0                    0                    0 
##         MANUFACTURER                BRAND              PACKAGE 
##                    0                    0                    0 
##                 ITEM             POP_SQMI               REGION 
##                    0                    0                    0 
##                MONTH               SEASON             PACKAGE2 
##                    0                    0                    0 
##         ENERGY_DRINK CALORIC_SEGMENT_TEXT      min_launch_date 
##                    0                94182                    0 
##   WEEKS_SINCE_LAUNCH 
##                    0
model <- lm(DOLLAR_SALES ~ UNIT_SALES  + POP_SQMI + REGION  + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = plum)
summary(model)
## 
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + POP_SQMI + REGION + 
##     MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = plum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4455.9   -54.8    -0.3    53.3  7116.0 
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                -8.833e+01  1.002e+01  -8.817
## UNIT_SALES                                  3.167e+00  5.958e-03 531.591
## POP_SQMI                                   -1.581e-03  2.987e-04  -5.292
## REGIONCALI_NEVADA                           1.532e+01  2.986e+00   5.131
## REGIONCOLORADO                              3.600e+01  1.649e+00  21.829
## REGIONDESERT_SW                             2.789e-01  2.205e+00   0.126
## REGIONKANSAS                                1.249e+02  3.948e+00  31.639
## REGIONMOUNTAIN                              2.969e+01  1.963e+00  15.126
## REGIONNEWMEXICO                             3.618e+01  2.563e+00  14.118
## REGIONNOCAL                                -6.519e+00  2.362e+00  -2.760
## REGIONNORTHERN                              1.621e+01  1.691e+00   9.585
## REGIONPRAIRIE                               2.519e+01  3.605e+00   6.987
## REGIONSOCAL                                -1.047e+01  1.717e+00  -6.102
## MONTH                                      -1.278e-01  1.751e-01  -0.730
## SEASONSPRING                                3.390e+00  1.751e+00   1.936
## SEASONSUMMER                                6.594e+00  1.630e+00   4.047
## SEASONWINTER                                1.239e+01  1.638e+00   7.566
## PACKAGE2CUP 12 LIQUID SMALL X12 NA          1.237e+02  9.747e+00  12.691
## PACKAGE2CUP 12 LIQUID SMALL X18 NA          1.005e+02  1.074e+01   9.355
## PACKAGE2CUP 12 LIQUID SMALL X20 NA          1.141e+02  1.237e+01   9.219
## PACKAGE2CUP 12 LIQUID SMALL X24 NA          3.215e+02  1.019e+01  31.561
## PACKAGE2CUPDY PLUM  JUG 24 LIQUID SMALL NA  3.973e+00  1.215e+01   0.327
## PACKAGE2JUG 12 LIQUID SMALL NA             -1.917e+01  9.898e+00  -1.937
## PACKAGE2JUG 12 LIQUID SMALL X24 NA          8.781e+01  1.432e+01   6.133
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA         5.460e+01  1.032e+01   5.290
## PACKAGE2JUG 20 LIQUID SMALL NA             -3.349e+01  9.781e+00  -3.424
## PACKAGE2JUG 67.6 LIQUID SMALL NA           -5.757e+01  9.780e+00  -5.886
## WEEKS_SINCE_LAUNCH                          6.321e-01  1.428e-02  44.271
##                                            Pr(>|t|)    
## (Intercept)                                 < 2e-16 ***
## UNIT_SALES                                  < 2e-16 ***
## POP_SQMI                                   1.21e-07 ***
## REGIONCALI_NEVADA                          2.89e-07 ***
## REGIONCOLORADO                              < 2e-16 ***
## REGIONDESERT_SW                            0.899340    
## REGIONKANSAS                                < 2e-16 ***
## REGIONMOUNTAIN                              < 2e-16 ***
## REGIONNEWMEXICO                             < 2e-16 ***
## REGIONNOCAL                                0.005783 ** 
## REGIONNORTHERN                              < 2e-16 ***
## REGIONPRAIRIE                              2.82e-12 ***
## REGIONSOCAL                                1.05e-09 ***
## MONTH                                      0.465285    
## SEASONSPRING                               0.052863 .  
## SEASONSUMMER                               5.20e-05 ***
## SEASONWINTER                               3.87e-14 ***
## PACKAGE2CUP 12 LIQUID SMALL X12 NA          < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X18 NA          < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X20 NA          < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X24 NA          < 2e-16 ***
## PACKAGE2CUPDY PLUM  JUG 24 LIQUID SMALL NA 0.743719    
## PACKAGE2JUG 12 LIQUID SMALL NA             0.052742 .  
## PACKAGE2JUG 12 LIQUID SMALL X24 NA         8.65e-10 ***
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA        1.23e-07 ***
## PACKAGE2JUG 20 LIQUID SMALL NA             0.000616 ***
## PACKAGE2JUG 67.6 LIQUID SMALL NA           3.96e-09 ***
## WEEKS_SINCE_LAUNCH                          < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 162 on 108858 degrees of freedom
## Multiple R-squared:  0.7551, Adjusted R-squared:  0.755 
## F-statistic: 1.243e+04 on 27 and 108858 DF,  p-value: < 2.2e-16
# Creating an 'innovation' data frame

model <- lm(UNIT_SALES ~ DOLLAR_SALES + PACKAGE + POP_SQMI + REGION   + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = plum)
summary(model)
## 
## Call:
## lm(formula = UNIT_SALES ~ DOLLAR_SALES + PACKAGE + POP_SQMI + 
##     REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, 
##     data = plum)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1269.04   -17.40    -2.92     8.59  1866.48 
## 
## Coefficients: (10 not defined because of singularities)
##                                              Estimate Std. Error t value
## (Intercept)                                 1.401e+01  1.146e+00  12.232
## DOLLAR_SALES                                2.279e-01  4.288e-04 531.591
## PACKAGE12SMALL 12ONE CUP                   -2.518e+00  9.556e-01  -2.635
## PACKAGE12SMALL 18ONE CUP                   -1.101e+01  1.494e+00  -7.370
## PACKAGE12SMALL 20ONE CUP                   -1.446e+01  2.240e+00  -6.455
## PACKAGE12SMALL 24ONE CUP                   -5.018e+01  1.219e+00 -41.158
## PACKAGE12SMALL 24ONE PLASTICS JUG          -8.691e+00  2.954e+00  -2.942
## PACKAGE12SMALL 6ONE CUP                     8.009e+00  2.769e+00   2.892
## PACKAGE12SMALL MLT PLASTICS JUG             2.241e+01  1.042e+00  21.511
## PACKAGE20SMALL MULTI JUG                    3.427e+01  9.721e-01  35.256
## PACKAGE24SMALL MLT SHADYES JUG              1.183e+01  2.183e+00   5.421
## PACKAGE2L MULTI JUG                         4.033e+01  9.746e-01  41.386
## POP_SQMI                                    1.420e-03  8.003e-05  17.740
## REGIONCALI_NEVADA                          -1.223e+01  8.002e-01 -15.283
## REGIONCOLORADO                             -1.016e+01  4.423e-01 -22.962
## REGIONDESERT_SW                            -3.747e+00  5.913e-01  -6.337
## REGIONKANSAS                               -1.066e+01  1.064e+00 -10.026
## REGIONMOUNTAIN                             -1.536e+00  5.272e-01  -2.915
## REGIONNEWMEXICO                            -1.105e+01  6.873e-01 -16.072
## REGIONNOCAL                                -4.461e+00  6.335e-01  -7.042
## REGIONNORTHERN                             -6.514e+00  4.535e-01 -14.362
## REGIONPRAIRIE                              -3.686e-01  9.672e-01  -0.381
## REGIONSOCAL                                 4.021e+00  4.605e-01   8.732
## MONTH                                       1.598e-01  4.696e-02   3.403
## SEASONSPRING                               -4.214e+00  4.696e-01  -8.975
## SEASONSUMMER                               -5.857e+00  4.368e-01 -13.408
## SEASONWINTER                               -7.436e+00  4.389e-01 -16.943
## PACKAGE2CUP 12 LIQUID SMALL X12 NA                 NA         NA      NA
## PACKAGE2CUP 12 LIQUID SMALL X18 NA                 NA         NA      NA
## PACKAGE2CUP 12 LIQUID SMALL X20 NA                 NA         NA      NA
## PACKAGE2CUP 12 LIQUID SMALL X24 NA                 NA         NA      NA
## PACKAGE2CUPDY PLUM  JUG 24 LIQUID SMALL NA         NA         NA      NA
## PACKAGE2JUG 12 LIQUID SMALL NA                     NA         NA      NA
## PACKAGE2JUG 12 LIQUID SMALL X24 NA                 NA         NA      NA
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA                NA         NA      NA
## PACKAGE2JUG 20 LIQUID SMALL NA                     NA         NA      NA
## PACKAGE2JUG 67.6 LIQUID SMALL NA                   NA         NA      NA
## WEEKS_SINCE_LAUNCH                         -2.029e-01  3.816e-03 -53.184
##                                            Pr(>|t|)    
## (Intercept)                                 < 2e-16 ***
## DOLLAR_SALES                                < 2e-16 ***
## PACKAGE12SMALL 12ONE CUP                   0.008408 ** 
## PACKAGE12SMALL 18ONE CUP                   1.72e-13 ***
## PACKAGE12SMALL 20ONE CUP                   1.08e-10 ***
## PACKAGE12SMALL 24ONE CUP                    < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG          0.003259 ** 
## PACKAGE12SMALL 6ONE CUP                    0.003827 ** 
## PACKAGE12SMALL MLT PLASTICS JUG             < 2e-16 ***
## PACKAGE20SMALL MULTI JUG                    < 2e-16 ***
## PACKAGE24SMALL MLT SHADYES JUG             5.93e-08 ***
## PACKAGE2L MULTI JUG                         < 2e-16 ***
## POP_SQMI                                    < 2e-16 ***
## REGIONCALI_NEVADA                           < 2e-16 ***
## REGIONCOLORADO                              < 2e-16 ***
## REGIONDESERT_SW                            2.35e-10 ***
## REGIONKANSAS                                < 2e-16 ***
## REGIONMOUNTAIN                             0.003562 ** 
## REGIONNEWMEXICO                             < 2e-16 ***
## REGIONNOCAL                                1.91e-12 ***
## REGIONNORTHERN                              < 2e-16 ***
## REGIONPRAIRIE                              0.703160    
## REGIONSOCAL                                 < 2e-16 ***
## MONTH                                      0.000666 ***
## SEASONSPRING                                < 2e-16 ***
## SEASONSUMMER                                < 2e-16 ***
## SEASONWINTER                                < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X12 NA               NA    
## PACKAGE2CUP 12 LIQUID SMALL X18 NA               NA    
## PACKAGE2CUP 12 LIQUID SMALL X20 NA               NA    
## PACKAGE2CUP 12 LIQUID SMALL X24 NA               NA    
## PACKAGE2CUPDY PLUM  JUG 24 LIQUID SMALL NA       NA    
## PACKAGE2JUG 12 LIQUID SMALL NA                   NA    
## PACKAGE2JUG 12 LIQUID SMALL X24 NA               NA    
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA              NA    
## PACKAGE2JUG 20 LIQUID SMALL NA                   NA    
## PACKAGE2JUG 67.6 LIQUID SMALL NA                 NA    
## WEEKS_SINCE_LAUNCH                          < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.45 on 108858 degrees of freedom
## Multiple R-squared:  0.7409, Adjusted R-squared:  0.7408 
## F-statistic: 1.153e+04 on 27 and 108858 DF,  p-value: < 2.2e-16
# Creating an 'innovation' data frame

model <- lm(UNIT_SALES ~  CALORIC_SEGMENT + PACKAGE + POP_SQMI + REGION   + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = plum)
summary(model)
## 
## Call:
## lm(formula = UNIT_SALES ~ CALORIC_SEGMENT + PACKAGE + POP_SQMI + 
##     REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, 
##     data = plum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -139.95  -39.95  -17.33   12.65 3062.52 
## 
## Coefficients: (10 not defined because of singularities)
##                                              Estimate Std. Error t value
## (Intercept)                                -1.055e+01  2.289e+00  -4.608
## CALORIC_SEGMENT                             3.402e+01  7.918e-01  42.963
## PACKAGE12SMALL 12ONE CUP                    5.193e+01  1.791e+00  28.994
## PACKAGE12SMALL 18ONE CUP                   -1.486e+00  2.810e+00  -0.529
## PACKAGE12SMALL 20ONE CUP                   -2.962e+00  4.212e+00  -0.703
## PACKAGE12SMALL 24ONE CUP                    3.805e+01  2.271e+00  16.752
## PACKAGE12SMALL 24ONE PLASTICS JUG          -4.557e+00  5.554e+00  -0.820
## PACKAGE12SMALL 6ONE CUP                    -1.798e+01  5.207e+00  -3.454
## PACKAGE12SMALL MLT PLASTICS JUG             1.954e+01  1.959e+00   9.976
## PACKAGE20SMALL MULTI JUG                    5.010e+01  1.827e+00  27.421
## PACKAGE24SMALL MLT SHADYES JUG              5.998e-01  4.104e+00   0.146
## PACKAGE2L MULTI JUG                         6.015e+01  1.839e+00  32.703
## POP_SQMI                                    3.627e-03  1.503e-04  24.130
## REGIONCALI_NEVADA                          -3.425e+01  1.505e+00 -22.762
## REGIONCOLORADO                             -8.409e+00  8.324e-01 -10.103
## REGIONDESERT_SW                            -1.531e+01  1.112e+00 -13.759
## REGIONKANSAS                                6.144e+01  1.983e+00  30.978
## REGIONMOUNTAIN                              1.591e+01  9.910e-01  16.050
## REGIONNEWMEXICO                            -1.301e+01  1.294e+00 -10.052
## REGIONNOCAL                                -2.429e+01  1.192e+00 -20.382
## REGIONNORTHERN                             -3.342e+00  8.672e-01  -3.853
## REGIONPRAIRIE                               1.636e+01  1.819e+00   8.998
## REGIONSOCAL                                 3.244e+00  8.680e-01   3.737
## MONTH                                       4.880e-01  8.830e-02   5.527
## SEASONSPRING                               -1.274e+01  8.826e-01 -14.436
## SEASONSUMMER                               -1.448e+01  8.211e-01 -17.636
## SEASONWINTER                               -1.657e+01  8.247e-01 -20.091
## PACKAGE2CUP 12 LIQUID SMALL X12 NA                 NA         NA      NA
## PACKAGE2CUP 12 LIQUID SMALL X18 NA                 NA         NA      NA
## PACKAGE2CUP 12 LIQUID SMALL X20 NA                 NA         NA      NA
## PACKAGE2CUP 12 LIQUID SMALL X24 NA                 NA         NA      NA
## PACKAGE2CUPDY PLUM  JUG 24 LIQUID SMALL NA         NA         NA      NA
## PACKAGE2JUG 12 LIQUID SMALL NA                     NA         NA      NA
## PACKAGE2JUG 12 LIQUID SMALL X24 NA                 NA         NA      NA
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA                NA         NA      NA
## PACKAGE2JUG 20 LIQUID SMALL NA                     NA         NA      NA
## PACKAGE2JUG 67.6 LIQUID SMALL NA                   NA         NA      NA
## WEEKS_SINCE_LAUNCH                         -1.885e-01  7.195e-03 -26.200
##                                            Pr(>|t|)    
## (Intercept)                                4.06e-06 ***
## CALORIC_SEGMENT                             < 2e-16 ***
## PACKAGE12SMALL 12ONE CUP                    < 2e-16 ***
## PACKAGE12SMALL 18ONE CUP                   0.596812    
## PACKAGE12SMALL 20ONE CUP                   0.481934    
## PACKAGE12SMALL 24ONE CUP                    < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG          0.411970    
## PACKAGE12SMALL 6ONE CUP                    0.000553 ***
## PACKAGE12SMALL MLT PLASTICS JUG             < 2e-16 ***
## PACKAGE20SMALL MULTI JUG                    < 2e-16 ***
## PACKAGE24SMALL MLT SHADYES JUG             0.883803    
## PACKAGE2L MULTI JUG                         < 2e-16 ***
## POP_SQMI                                    < 2e-16 ***
## REGIONCALI_NEVADA                           < 2e-16 ***
## REGIONCOLORADO                              < 2e-16 ***
## REGIONDESERT_SW                             < 2e-16 ***
## REGIONKANSAS                                < 2e-16 ***
## REGIONMOUNTAIN                              < 2e-16 ***
## REGIONNEWMEXICO                             < 2e-16 ***
## REGIONNOCAL                                 < 2e-16 ***
## REGIONNORTHERN                             0.000117 ***
## REGIONPRAIRIE                               < 2e-16 ***
## REGIONSOCAL                                0.000186 ***
## MONTH                                      3.27e-08 ***
## SEASONSPRING                                < 2e-16 ***
## SEASONSUMMER                                < 2e-16 ***
## SEASONWINTER                                < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X12 NA               NA    
## PACKAGE2CUP 12 LIQUID SMALL X18 NA               NA    
## PACKAGE2CUP 12 LIQUID SMALL X20 NA               NA    
## PACKAGE2CUP 12 LIQUID SMALL X24 NA               NA    
## PACKAGE2CUPDY PLUM  JUG 24 LIQUID SMALL NA       NA    
## PACKAGE2JUG 12 LIQUID SMALL NA                   NA    
## PACKAGE2JUG 12 LIQUID SMALL X24 NA               NA    
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA              NA    
## PACKAGE2JUG 20 LIQUID SMALL NA                   NA    
## PACKAGE2JUG 67.6 LIQUID SMALL NA                 NA    
## WEEKS_SINCE_LAUNCH                          < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81.71 on 108858 degrees of freedom
## Multiple R-squared:  0.08372,    Adjusted R-squared:  0.08349 
## F-statistic: 368.4 on 27 and 108858 DF,  p-value: < 2.2e-16
#weekly sales where ITEM contains "RAINING THRASHED PLUM"
plum %>%
  filter(str_detect(ITEM, "RAINING  THRASHED PLUM")) %>%
  group_by(DATE) %>%
  summarise(UNIT_SALES = sum(UNIT_SALES)) %>%
  ggplot(aes(x = DATE, y = UNIT_SALES)) +
  geom_line() +
  labs(title = "Weekly Sales of 'RAINING THRASHED PLUM'", x = "Week of Year", y = "Unit Sales")

# Load and prepare dataset
df <- read.csv("plum_tableau.csv") 
# Load and prepare dataset 

str(df)
## 'data.frame':    108886 obs. of  19 variables:
##  $ MARKET_KEY          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DATE                : chr  "2022-06-18" "2022-04-30" "2021-12-11" "2022-07-30" ...
##  $ CALORIC_SEGMENT     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ CATEGORY            : chr  "SSD" "SSD" "SSD" "SSD" ...
##  $ UNIT_SALES          : int  1 14 18 13 19 4 29 35 75 25 ...
##  $ DOLLAR_SALES        : num  4.62 86.86 89.73 65.6 72.93 ...
##  $ MANUFACTURER        : chr  "JOLLYS" "JOLLYS" "JOLLYS" "JOLLYS" ...
##  $ BRAND               : chr  "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" ...
##  $ PACKAGE             : chr  "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" ...
##  $ ITEM                : chr  "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" ...
##  $ POP_SQMI            : num  1.2 1.2 1.2 1.2 1.2 ...
##  $ REGION              : chr  "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
##  $ MONTH               : int  6 4 12 7 11 6 1 11 10 12 ...
##  $ SEASON              : chr  "SUMMER" "SPRING" "WINTER" "SUMMER" ...
##  $ PACKAGE2            : chr  "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
##  $ ENERGY_DRINK        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CALORIC_SEGMENT_TEXT: chr  NA NA NA NA ...
##  $ min_launch_date     : chr  "2021-09-04" "2021-09-04" "2021-09-04" "2021-09-04" ...
##  $ WEEKS_SINCE_LAUNCH  : int  41 34 14 47 12 40 20 9 6 13 ...
plum <- df %>% 
  #select(-DATE, -MONTH, -SEASON, -BRAND, -REGION, -ITEM )
  select(-MONTH, -SEASON, -min_launch_date, -PACKAGE2, -CALORIC_SEGMENT_TEXT)
print(unique(plum$ITEM))
## [1] "RAINING  THRASHED PLUM"        "BEAUTIFUL GREENER  PLUM"      
## [3] " SMASH  SUNSET"                "FANTASMIC  PLUM"              
## [5] "ZIZZLES  PLUM"                 "GO-DAY "                      
## [7] "FANTASMIC  CUSTARD APPLE PLUM"
print(unique(plum$BRAND))
## [1] "HILL MOISTURE THRASHED APPLE" "BEAUTIFUL GREENER"           
## [3] "DIET SMASH"                   "FANTASMIC"                   
## [5] "SINGLE GROUP"                 "GO-DAY"
print(unique(plum$CATEGORY))
## [1] "SSD"
print(unique(plum$PACKAGE))
##  [1] "12SMALL 12ONE CUP"          "20SMALL MULTI JUG"         
##  [3] "2L MULTI JUG"               "12SMALL 6ONE CUP"          
##  [5] "12SMALL 24ONE CUP"          "12SMALL MLT PLASTICS JUG"  
##  [7] "12SMALL 20ONE CUP"          ".5L 6ONE JUG"              
##  [9] "12SMALL 18ONE CUP"          "12SMALL 24ONE PLASTICS JUG"
## [11] "24SMALL MLT SHADYES JUG"
# Assuming plum is your data frame and PACKAGING is the column of interest

# Create new columns in woodsy for each unique substring
# Each column will have a 1 if the substring is found in the PACKAGING column, 0 otherwise

plum$`12SMALL 12ONE CUP` = as.integer(grepl("12SMALL 12ONE CUP", plum$PACKAGE))
plum$`20SMALL MULTI JUG` = as.integer(grepl("20SMALL MULTI JUG", plum$PACKAGE))
plum$`12SMALL 6ONE CUP` = as.integer(grepl("12SMALL 6ONE CUP", plum$PACKAGE))
plum$`12SMALL 24ONE CUP` = as.integer(grepl("12SMALL 24ONE CUP", plum$PACKAGE))
plum$`12SMALL MLT PLASTICS JUG` = as.integer(grepl("12SMALL MLT PLASTICS JUG", plum$PACKAGE))
plum$`.5L 6ONE JUG` = as.integer(grepl(".5L 6ONE JUG", plum$PACKAGE))
plum$`12SMALL 20ONE CUP` = as.integer(grepl("12SMALL 20ONE CUP", plum$PACKAGE))
plum$`12SMALL 18ONE CUP` = as.integer(grepl("12SMALL 18ONE CUP", plum$PACKAGE))
plum$`12SMALL 24ONE PLASTICS JUG` = as.integer(grepl("12SMALL 24ONE PLASTICS JUG", plum$PACKAGE))

     

#one hot encode non brand ITEM strings
#"RAINING THRASHED PLUM"  "BEAUTIFUL GREENER PLUM" "ZIZZLES PLUM"     
plum$`RAINING THRASHED PLUM` = as.integer(grepl("RAINING THRASHED PLUM", plum$ITEM))
plum$`BEAUTIFUL GREENER PLUM` = as.integer(grepl("BEAUTIFUL GREENER PLUM", plum$ITEM))
plum$`ZIZZLES PLUM` = as.integer(grepl("ZIZZLES PLUM", plum$ITEM))

# Print the head of the data frame to see the first few rows
head(plum)
##   MARKET_KEY       DATE CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
## 1          1 2022-06-18               1      SSD          1         4.62
## 2          1 2022-04-30               1      SSD         14        86.86
## 3          1 2021-12-11               1      SSD         18        89.73
## 4          1 2022-07-30               1      SSD         13        65.60
## 5          1 2021-11-27               1      SSD         19        72.93
## 6          1 2022-06-11               1      SSD          4        25.60
##   MANUFACTURER                        BRAND           PACKAGE
## 1       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 2       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 3       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 4       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 5       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 6       JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
##                     ITEM POP_SQMI   REGION ENERGY_DRINK WEEKS_SINCE_LAUNCH
## 1 RAINING  THRASHED PLUM 1.201114 NORTHERN            0                 41
## 2 RAINING  THRASHED PLUM 1.201114 NORTHERN            0                 34
## 3 RAINING  THRASHED PLUM 1.201114 NORTHERN            0                 14
## 4 RAINING  THRASHED PLUM 1.201114 NORTHERN            0                 47
## 5 RAINING  THRASHED PLUM 1.201114 NORTHERN            0                 12
## 6 RAINING  THRASHED PLUM 1.201114 NORTHERN            0                 40
##   12SMALL 12ONE CUP 20SMALL MULTI JUG 12SMALL 6ONE CUP 12SMALL 24ONE CUP
## 1                 1                 0                0                 0
## 2                 1                 0                0                 0
## 3                 1                 0                0                 0
## 4                 1                 0                0                 0
## 5                 1                 0                0                 0
## 6                 1                 0                0                 0
##   12SMALL MLT PLASTICS JUG .5L 6ONE JUG 12SMALL 20ONE CUP 12SMALL 18ONE CUP
## 1                        0            0                 0                 0
## 2                        0            0                 0                 0
## 3                        0            0                 0                 0
## 4                        0            0                 0                 0
## 5                        0            0                 0                 0
## 6                        0            0                 0                 0
##   12SMALL 24ONE PLASTICS JUG RAINING THRASHED PLUM BEAUTIFUL GREENER PLUM
## 1                          0                     0                      0
## 2                          0                     0                      0
## 3                          0                     0                      0
## 4                          0                     0                      0
## 5                          0                     0                      0
## 6                          0                     0                      0
##   ZIZZLES PLUM
## 1            0
## 2            0
## 3            0
## 4            0
## 5            0
## 6            0
plum$CATEGORY <- NULL
plum$MARKET_KEY <- NULL
plum$MANUFACTURER <- NULL
plum$PACKAGE <- NULL
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
# One-hot encode the specified columns
plum <- fastDummies::dummy_cols(plum, select_columns = c("REGION", "ITEM"), remove_selected_columns = TRUE)
#plum <- fastDummies::dummy_cols(plum, select_columns = c("REGION", "SEASON","ITEM"), remove_selected_columns = TRUE)

# View the first few rows to verify the changes
head(plum)
##         DATE CALORIC_SEGMENT UNIT_SALES DOLLAR_SALES
## 1 2022-06-18               1          1         4.62
## 2 2022-04-30               1         14        86.86
## 3 2021-12-11               1         18        89.73
## 4 2022-07-30               1         13        65.60
## 5 2021-11-27               1         19        72.93
## 6 2022-06-11               1          4        25.60
##                          BRAND POP_SQMI ENERGY_DRINK WEEKS_SINCE_LAUNCH
## 1 HILL MOISTURE THRASHED APPLE 1.201114            0                 41
## 2 HILL MOISTURE THRASHED APPLE 1.201114            0                 34
## 3 HILL MOISTURE THRASHED APPLE 1.201114            0                 14
## 4 HILL MOISTURE THRASHED APPLE 1.201114            0                 47
## 5 HILL MOISTURE THRASHED APPLE 1.201114            0                 12
## 6 HILL MOISTURE THRASHED APPLE 1.201114            0                 40
##   12SMALL 12ONE CUP 20SMALL MULTI JUG 12SMALL 6ONE CUP 12SMALL 24ONE CUP
## 1                 1                 0                0                 0
## 2                 1                 0                0                 0
## 3                 1                 0                0                 0
## 4                 1                 0                0                 0
## 5                 1                 0                0                 0
## 6                 1                 0                0                 0
##   12SMALL MLT PLASTICS JUG .5L 6ONE JUG 12SMALL 20ONE CUP 12SMALL 18ONE CUP
## 1                        0            0                 0                 0
## 2                        0            0                 0                 0
## 3                        0            0                 0                 0
## 4                        0            0                 0                 0
## 5                        0            0                 0                 0
## 6                        0            0                 0                 0
##   12SMALL 24ONE PLASTICS JUG RAINING THRASHED PLUM BEAUTIFUL GREENER PLUM
## 1                          0                     0                      0
## 2                          0                     0                      0
## 3                          0                     0                      0
## 4                          0                     0                      0
## 5                          0                     0                      0
## 6                          0                     0                      0
##   ZIZZLES PLUM REGION_ARIZONA REGION_CALI_NEVADA REGION_COLORADO
## 1            0              0                  0               0
## 2            0              0                  0               0
## 3            0              0                  0               0
## 4            0              0                  0               0
## 5            0              0                  0               0
## 6            0              0                  0               0
##   REGION_DESERT_SW REGION_KANSAS REGION_MOUNTAIN REGION_NEWMEXICO REGION_NOCAL
## 1                0             0               0                0            0
## 2                0             0               0                0            0
## 3                0             0               0                0            0
## 4                0             0               0                0            0
## 5                0             0               0                0            0
## 6                0             0               0                0            0
##   REGION_NORTHERN REGION_PRAIRIE REGION_SOCAL ITEM_ SMASH  SUNSET
## 1               1              0            0                   0
## 2               1              0            0                   0
## 3               1              0            0                   0
## 4               1              0            0                   0
## 5               1              0            0                   0
## 6               1              0            0                   0
##   ITEM_BEAUTIFUL GREENER  PLUM ITEM_FANTASMIC  CUSTARD APPLE PLUM
## 1                            0                                  0
## 2                            0                                  0
## 3                            0                                  0
## 4                            0                                  0
## 5                            0                                  0
## 6                            0                                  0
##   ITEM_FANTASMIC  PLUM ITEM_GO-DAY  ITEM_RAINING  THRASHED PLUM
## 1                    0            0                           1
## 2                    0            0                           1
## 3                    0            0                           1
## 4                    0            0                           1
## 5                    0            0                           1
## 6                    0            0                           1
##   ITEM_ZIZZLES  PLUM
## 1                  0
## 2                  0
## 3                  0
## 4                  0
## 5                  0
## 6                  0
write.csv(plum, "plum_one_hot.csv", row.names = FALSE)
library(fastDummies)

# One-hot encode
plum <- fastDummies::dummy_cols(plum, select_columns = "BRAND", remove_selected_columns = TRUE)

# View the first few rows to verify
head(plum)
##         DATE CALORIC_SEGMENT UNIT_SALES DOLLAR_SALES POP_SQMI ENERGY_DRINK
## 1 2022-06-18               1          1         4.62 1.201114            0
## 2 2022-04-30               1         14        86.86 1.201114            0
## 3 2021-12-11               1         18        89.73 1.201114            0
## 4 2022-07-30               1         13        65.60 1.201114            0
## 5 2021-11-27               1         19        72.93 1.201114            0
## 6 2022-06-11               1          4        25.60 1.201114            0
##   WEEKS_SINCE_LAUNCH 12SMALL 12ONE CUP 20SMALL MULTI JUG 12SMALL 6ONE CUP
## 1                 41                 1                 0                0
## 2                 34                 1                 0                0
## 3                 14                 1                 0                0
## 4                 47                 1                 0                0
## 5                 12                 1                 0                0
## 6                 40                 1                 0                0
##   12SMALL 24ONE CUP 12SMALL MLT PLASTICS JUG .5L 6ONE JUG 12SMALL 20ONE CUP
## 1                 0                        0            0                 0
## 2                 0                        0            0                 0
## 3                 0                        0            0                 0
## 4                 0                        0            0                 0
## 5                 0                        0            0                 0
## 6                 0                        0            0                 0
##   12SMALL 18ONE CUP 12SMALL 24ONE PLASTICS JUG RAINING THRASHED PLUM
## 1                 0                          0                     0
## 2                 0                          0                     0
## 3                 0                          0                     0
## 4                 0                          0                     0
## 5                 0                          0                     0
## 6                 0                          0                     0
##   BEAUTIFUL GREENER PLUM ZIZZLES PLUM REGION_ARIZONA REGION_CALI_NEVADA
## 1                      0            0              0                  0
## 2                      0            0              0                  0
## 3                      0            0              0                  0
## 4                      0            0              0                  0
## 5                      0            0              0                  0
## 6                      0            0              0                  0
##   REGION_COLORADO REGION_DESERT_SW REGION_KANSAS REGION_MOUNTAIN
## 1               0                0             0               0
## 2               0                0             0               0
## 3               0                0             0               0
## 4               0                0             0               0
## 5               0                0             0               0
## 6               0                0             0               0
##   REGION_NEWMEXICO REGION_NOCAL REGION_NORTHERN REGION_PRAIRIE REGION_SOCAL
## 1                0            0               1              0            0
## 2                0            0               1              0            0
## 3                0            0               1              0            0
## 4                0            0               1              0            0
## 5                0            0               1              0            0
## 6                0            0               1              0            0
##   ITEM_ SMASH  SUNSET ITEM_BEAUTIFUL GREENER  PLUM
## 1                   0                            0
## 2                   0                            0
## 3                   0                            0
## 4                   0                            0
## 5                   0                            0
## 6                   0                            0
##   ITEM_FANTASMIC  CUSTARD APPLE PLUM ITEM_FANTASMIC  PLUM ITEM_GO-DAY 
## 1                                  0                    0            0
## 2                                  0                    0            0
## 3                                  0                    0            0
## 4                                  0                    0            0
## 5                                  0                    0            0
## 6                                  0                    0            0
##   ITEM_RAINING  THRASHED PLUM ITEM_ZIZZLES  PLUM BRAND_BEAUTIFUL GREENER
## 1                           1                  0                       0
## 2                           1                  0                       0
## 3                           1                  0                       0
## 4                           1                  0                       0
## 5                           1                  0                       0
## 6                           1                  0                       0
##   BRAND_DIET SMASH BRAND_FANTASMIC BRAND_GO-DAY
## 1                0               0            0
## 2                0               0            0
## 3                0               0            0
## 4                0               0            0
## 5                0               0            0
## 6                0               0            0
##   BRAND_HILL MOISTURE THRASHED APPLE BRAND_SINGLE GROUP
## 1                                  1                  0
## 2                                  1                  0
## 3                                  1                  0
## 4                                  1                  0
## 5                                  1                  0
## 6                                  1                  0
#create new week of year column

plum <- plum %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK_OF_YEAR = lubridate::week(DATE))

#Drop DATE column
plum$DATE <- NULL
# Summarize the dataset
skimr::skim(plum)
Data summary
Name plum
Number of rows 108886
Number of columns 43
_______________________
Column type frequency:
numeric 43
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.86 0.34 0.00 1.00 1.00 1.00 1.00 ▁▁▁▁▇
UNIT_SALES 0 1 51.24 85.35 1.00 9.00 25.00 58.00 3157.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 171.84 327.25 0.31 27.12 74.65 184.04 12763.23 ▇▁▁▁▁
POP_SQMI 0 1 1723.52 1926.89 0.18 57.10 843.08 3191.93 6769.35 ▇▂▂▁▁
ENERGY_DRINK 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
WEEKS_SINCE_LAUNCH 0 1 59.78 35.73 0.00 30.00 58.00 87.00 128.00 ▇▇▇▆▆
12SMALL 12ONE CUP 0 1 0.44 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
20SMALL MULTI JUG 0 1 0.21 0.41 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
12SMALL 6ONE CUP 0 1 0.00 0.05 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
12SMALL 24ONE CUP 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
12SMALL MLT PLASTICS JUG 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
.5L 6ONE JUG 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
12SMALL 20ONE CUP 0 1 0.00 0.06 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
12SMALL 18ONE CUP 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
12SMALL 24ONE PLASTICS JUG 0 1 0.00 0.05 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
RAINING THRASHED PLUM 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
BEAUTIFUL GREENER PLUM 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
ZIZZLES PLUM 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
REGION_ARIZONA 0 1 0.28 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
REGION_CALI_NEVADA 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_COLORADO 0 1 0.13 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_DESERT_SW 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_KANSAS 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_MOUNTAIN 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_NEWMEXICO 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_NOCAL 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_NORTHERN 0 1 0.13 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_PRAIRIE 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_SOCAL 0 1 0.14 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ITEM_ SMASH SUNSET 0 1 0.14 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ITEM_BEAUTIFUL GREENER PLUM 0 1 0.59 0.49 0.00 0.00 1.00 1.00 1.00 ▆▁▁▁▇
ITEM_FANTASMIC CUSTARD APPLE PLUM 0 1 0.00 0.01 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ITEM_FANTASMIC PLUM 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ITEM_GO-DAY 0 1 0.00 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ITEM_RAINING THRASHED PLUM 0 1 0.24 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
ITEM_ZIZZLES PLUM 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
BRAND_BEAUTIFUL GREENER 0 1 0.59 0.49 0.00 0.00 1.00 1.00 1.00 ▆▁▁▁▇
BRAND_DIET SMASH 0 1 0.14 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
BRAND_FANTASMIC 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
BRAND_GO-DAY 0 1 0.00 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
BRAND_HILL MOISTURE THRASHED APPLE 0 1 0.24 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
BRAND_SINGLE GROUP 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
WEEK_OF_YEAR 0 1 25.40 15.86 1.00 11.00 23.00 40.00 53.00 ▇▇▅▅▆
#remove top 10 percent of unit sales to clean up outliers
df <- plum %>% 
  filter(UNIT_SALES < quantile(UNIT_SALES, 0.99))
# Split the data
set.seed(123)
df_testtrn <- initial_split(df, prop = 0.8, strata = UNIT_SALES)
Train <- training(df_testtrn)
Test <- testing(df_testtrn)

# Prepare features and labels for XGBoost
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES

# Convert data to DMatrix format
dtrain <- xgb.DMatrix(data = as.matrix(train_features), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_features), label = test_labels)
# Define XGBoost parameters
set.seed(123)
params <- list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eval_metric = "rmse",
  eta = 0.05,
  max_depth = 4,
  min_child_weight = 3,
  subsample = 0.7,
  colsample_bytree = 0.6,
  lambda = 1,
  alpha = 1
)
# Perform cross-validation to find the optimal number of boosting rounds
cv_results <- xgb.cv(
  params = params,
  data = dtrain,  
  nfold = 5,
  nrounds = 500,  # Changed from 'num_boost_round' to 'nrounds'
  early_stopping_rounds = 10,
  metrics = "rmse",
  seed = 123
)
## [1]  train-rmse:72.137771+0.694189   test-rmse:72.122812+1.667395 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
## 
## [2]  train-rmse:69.255785+1.419294   test-rmse:69.241268+2.195574 
## [3]  train-rmse:66.960070+1.396622   test-rmse:66.952155+1.992800 
## [4]  train-rmse:65.320298+1.809715   test-rmse:65.317026+2.342574 
## [5]  train-rmse:63.264055+1.898804   test-rmse:63.253139+2.160364 
## [6]  train-rmse:60.868240+1.911852   test-rmse:60.873311+2.366105 
## [7]  train-rmse:58.958888+2.123034   test-rmse:58.995136+2.756431 
## [8]  train-rmse:57.365471+1.877309   test-rmse:57.416698+2.755472 
## [9]  train-rmse:55.350107+1.948074   test-rmse:55.387263+2.507748 
## [10] train-rmse:53.003066+1.855188   test-rmse:53.046293+2.402330 
## [11] train-rmse:51.623821+2.252422   test-rmse:51.666774+2.642721 
## [12] train-rmse:50.048931+2.282919   test-rmse:50.086026+2.464909 
## [13] train-rmse:48.162787+2.171677   test-rmse:48.206389+2.344030 
## [14] train-rmse:47.255850+2.153060   test-rmse:47.301651+2.274130 
## [15] train-rmse:46.203406+2.591040   test-rmse:46.258844+2.674439 
## [16] train-rmse:44.504130+2.471292   test-rmse:44.556336+2.514257 
## [17] train-rmse:43.386682+2.152247   test-rmse:43.442540+2.358840 
## [18] train-rmse:41.733663+1.967301   test-rmse:41.793139+2.187208 
## [19] train-rmse:40.998679+1.839151   test-rmse:41.054409+1.981375 
## [20] train-rmse:39.682598+1.425809   test-rmse:39.735369+1.586714 
## [21] train-rmse:38.904505+1.402720   test-rmse:38.943172+1.380490 
## [22] train-rmse:37.740772+1.110022   test-rmse:37.778690+1.041240 
## [23] train-rmse:36.940487+1.393524   test-rmse:36.980743+1.226831 
## [24] train-rmse:35.712688+1.344147   test-rmse:35.753979+1.143263 
## [25] train-rmse:35.202848+1.631906   test-rmse:35.250355+1.496182 
## [26] train-rmse:34.344661+1.829583   test-rmse:34.389204+1.523998 
## [27] train-rmse:33.331308+1.708773   test-rmse:33.384636+1.537119 
## [28] train-rmse:32.601322+1.741031   test-rmse:32.647010+1.370052 
## [29] train-rmse:32.030753+1.715564   test-rmse:32.089619+1.563183 
## [30] train-rmse:31.371337+1.619875   test-rmse:31.442856+1.634927 
## [31] train-rmse:30.454055+1.614446   test-rmse:30.531082+1.716651 
## [32] train-rmse:29.951549+1.911215   test-rmse:30.031847+1.955978 
## [33] train-rmse:29.125629+1.925990   test-rmse:29.216651+2.001208 
## [34] train-rmse:28.362703+1.960172   test-rmse:28.461554+2.093729 
## [35] train-rmse:28.028859+1.936583   test-rmse:28.125461+2.013657 
## [36] train-rmse:27.417881+1.660393   test-rmse:27.518401+1.733058 
## [37] train-rmse:26.985197+1.717643   test-rmse:27.084313+1.692132 
## [38] train-rmse:26.578732+1.430717   test-rmse:26.687021+1.534495 
## [39] train-rmse:26.251098+1.340034   test-rmse:26.357109+1.376575 
## [40] train-rmse:25.806954+1.122894   test-rmse:25.911649+1.199642 
## [41] train-rmse:25.244082+1.252438   test-rmse:25.356067+1.334437 
## [42] train-rmse:24.745634+1.148779   test-rmse:24.864694+1.324762 
## [43] train-rmse:24.299181+1.125123   test-rmse:24.422515+1.354251 
## [44] train-rmse:23.964521+1.071489   test-rmse:24.098154+1.377127 
## [45] train-rmse:23.517820+1.078028   test-rmse:23.650329+1.293446 
## [46] train-rmse:23.303199+1.027667   test-rmse:23.445906+1.369477 
## [47] train-rmse:22.994039+0.890609   test-rmse:23.129871+1.169562 
## [48] train-rmse:22.557341+0.930225   test-rmse:22.699237+1.178580 
## [49] train-rmse:22.253992+0.928062   test-rmse:22.395832+1.235226 
## [50] train-rmse:21.773938+0.884540   test-rmse:21.916110+1.078190 
## [51] train-rmse:21.359928+0.920151   test-rmse:21.506013+1.155020 
## [52] train-rmse:21.246803+0.997746   test-rmse:21.392823+1.223455 
## [53] train-rmse:21.111708+1.119717   test-rmse:21.259855+1.337181 
## [54] train-rmse:20.836652+1.117059   test-rmse:20.985814+1.336396 
## [55] train-rmse:20.703284+1.177471   test-rmse:20.849252+1.364993 
## [56] train-rmse:20.464724+1.286973   test-rmse:20.614273+1.481153 
## [57] train-rmse:20.041332+1.151340   test-rmse:20.206083+1.364809 
## [58] train-rmse:19.749614+1.126183   test-rmse:19.915663+1.395039 
## [59] train-rmse:19.496684+1.131648   test-rmse:19.659368+1.366849 
## [60] train-rmse:19.371638+1.185437   test-rmse:19.539404+1.448636 
## [61] train-rmse:18.959551+1.122090   test-rmse:19.128390+1.350801 
## [62] train-rmse:18.876290+1.178157   test-rmse:19.046207+1.405915 
## [63] train-rmse:18.564062+1.150144   test-rmse:18.735915+1.376422 
## [64] train-rmse:18.266796+1.094644   test-rmse:18.442844+1.336064 
## [65] train-rmse:17.963045+0.979500   test-rmse:18.140928+1.253338 
## [66] train-rmse:17.715753+0.957250   test-rmse:17.892912+1.184003 
## [67] train-rmse:17.499816+0.853499   test-rmse:17.680738+1.120640 
## [68] train-rmse:17.358329+0.867539   test-rmse:17.541940+1.152322 
## [69] train-rmse:17.198595+0.870387   test-rmse:17.386982+1.163704 
## [70] train-rmse:17.052551+0.860202   test-rmse:17.242459+1.180460 
## [71] train-rmse:16.947208+0.861727   test-rmse:17.137946+1.191236 
## [72] train-rmse:16.875120+0.848341   test-rmse:17.065087+1.175685 
## [73] train-rmse:16.718737+0.812504   test-rmse:16.917857+1.150183 
## [74] train-rmse:16.555205+0.839413   test-rmse:16.753342+1.196134 
## [75] train-rmse:16.449067+0.817016   test-rmse:16.645079+1.176282 
## [76] train-rmse:16.319150+0.821313   test-rmse:16.519383+1.168047 
## [77] train-rmse:16.171130+0.846390   test-rmse:16.376551+1.211002 
## [78] train-rmse:16.035068+0.830742   test-rmse:16.248189+1.204612 
## [79] train-rmse:15.892681+0.867631   test-rmse:16.110772+1.234937 
## [80] train-rmse:15.708920+0.867126   test-rmse:15.934721+1.228622 
## [81] train-rmse:15.498828+0.804373   test-rmse:15.727228+1.168941 
## [82] train-rmse:15.308794+0.783506   test-rmse:15.533496+1.149043 
## [83] train-rmse:15.219015+0.770153   test-rmse:15.441557+1.134210 
## [84] train-rmse:15.027482+0.728806   test-rmse:15.251472+1.089898 
## [85] train-rmse:14.888443+0.701976   test-rmse:15.115715+1.047146 
## [86] train-rmse:14.773896+0.639976   test-rmse:14.999449+0.980189 
## [87] train-rmse:14.618083+0.645314   test-rmse:14.848241+0.983271 
## [88] train-rmse:14.527579+0.654372   test-rmse:14.759424+0.975085 
## [89] train-rmse:14.408054+0.660338   test-rmse:14.640384+0.968095 
## [90] train-rmse:14.294937+0.658424   test-rmse:14.523074+0.945605 
## [91] train-rmse:14.153347+0.594116   test-rmse:14.381315+0.878431 
## [92] train-rmse:14.045909+0.559802   test-rmse:14.278560+0.824898 
## [93] train-rmse:13.921658+0.546825   test-rmse:14.158136+0.816067 
## [94] train-rmse:13.837715+0.541879   test-rmse:14.075464+0.817687 
## [95] train-rmse:13.784775+0.533811   test-rmse:14.021970+0.811563 
## [96] train-rmse:13.733846+0.546830   test-rmse:13.970142+0.813271 
## [97] train-rmse:13.651351+0.545428   test-rmse:13.894688+0.801871 
## [98] train-rmse:13.560599+0.520556   test-rmse:13.802283+0.775158 
## [99] train-rmse:13.426946+0.475212   test-rmse:13.672718+0.726234 
## [100]    train-rmse:13.308250+0.463842   test-rmse:13.549346+0.741427 
## [101]    train-rmse:13.215402+0.461362   test-rmse:13.456300+0.750187 
## [102]    train-rmse:13.130064+0.439550   test-rmse:13.371356+0.715389 
## [103]    train-rmse:13.072106+0.446331   test-rmse:13.317727+0.734271 
## [104]    train-rmse:13.018722+0.441255   test-rmse:13.268797+0.731916 
## [105]    train-rmse:12.942358+0.428125   test-rmse:13.192922+0.733786 
## [106]    train-rmse:12.891616+0.418396   test-rmse:13.143847+0.732341 
## [107]    train-rmse:12.837582+0.433064   test-rmse:13.089560+0.751399 
## [108]    train-rmse:12.743748+0.399644   test-rmse:12.997436+0.727625 
## [109]    train-rmse:12.690973+0.412301   test-rmse:12.945846+0.739899 
## [110]    train-rmse:12.651858+0.409110   test-rmse:12.906227+0.743594 
## [111]    train-rmse:12.566238+0.403905   test-rmse:12.822794+0.736171 
## [112]    train-rmse:12.521378+0.389322   test-rmse:12.779115+0.730042 
## [113]    train-rmse:12.490678+0.384063   test-rmse:12.747657+0.726155 
## [114]    train-rmse:12.442329+0.357870   test-rmse:12.701307+0.694044 
## [115]    train-rmse:12.395358+0.344879   test-rmse:12.657760+0.672183 
## [116]    train-rmse:12.370905+0.337244   test-rmse:12.636029+0.661169 
## [117]    train-rmse:12.312100+0.356418   test-rmse:12.580973+0.655359 
## [118]    train-rmse:12.241180+0.348380   test-rmse:12.512167+0.640872 
## [119]    train-rmse:12.202415+0.338332   test-rmse:12.474006+0.635763 
## [120]    train-rmse:12.166751+0.330310   test-rmse:12.439790+0.617169 
## [121]    train-rmse:12.114123+0.315740   test-rmse:12.386903+0.589415 
## [122]    train-rmse:12.075297+0.307016   test-rmse:12.348859+0.586133 
## [123]    train-rmse:12.008850+0.312077   test-rmse:12.284926+0.588376 
## [124]    train-rmse:11.927500+0.281543   test-rmse:12.203921+0.570579 
## [125]    train-rmse:11.866934+0.293521   test-rmse:12.144018+0.571543 
## [126]    train-rmse:11.807208+0.291227   test-rmse:12.084435+0.585336 
## [127]    train-rmse:11.760391+0.286373   test-rmse:12.035894+0.575901 
## [128]    train-rmse:11.717724+0.284048   test-rmse:11.995136+0.565307 
## [129]    train-rmse:11.679056+0.296256   test-rmse:11.957377+0.584220 
## [130]    train-rmse:11.634195+0.281800   test-rmse:11.911041+0.574768 
## [131]    train-rmse:11.594000+0.284716   test-rmse:11.873547+0.568645 
## [132]    train-rmse:11.555011+0.277994   test-rmse:11.833736+0.567530 
## [133]    train-rmse:11.515344+0.277920   test-rmse:11.795876+0.576864 
## [134]    train-rmse:11.470954+0.266291   test-rmse:11.751557+0.569849 
## [135]    train-rmse:11.430924+0.263766   test-rmse:11.710253+0.567225 
## [136]    train-rmse:11.397417+0.277372   test-rmse:11.675422+0.574469 
## [137]    train-rmse:11.380889+0.278505   test-rmse:11.659234+0.577216 
## [138]    train-rmse:11.351831+0.271880   test-rmse:11.630256+0.560806 
## [139]    train-rmse:11.307443+0.273585   test-rmse:11.587953+0.561272 
## [140]    train-rmse:11.272888+0.274491   test-rmse:11.554291+0.559348 
## [141]    train-rmse:11.244981+0.274483   test-rmse:11.529049+0.545898 
## [142]    train-rmse:11.209663+0.274223   test-rmse:11.495535+0.552692 
## [143]    train-rmse:11.161496+0.270217   test-rmse:11.447453+0.552400 
## [144]    train-rmse:11.122443+0.276070   test-rmse:11.408279+0.562051 
## [145]    train-rmse:11.093935+0.276443   test-rmse:11.381333+0.565862 
## [146]    train-rmse:11.046198+0.268312   test-rmse:11.335569+0.565482 
## [147]    train-rmse:11.013764+0.252605   test-rmse:11.303178+0.562111 
## [148]    train-rmse:10.980656+0.241410   test-rmse:11.268562+0.545482 
## [149]    train-rmse:10.947976+0.233810   test-rmse:11.236407+0.546195 
## [150]    train-rmse:10.898912+0.229192   test-rmse:11.188985+0.540844 
## [151]    train-rmse:10.869803+0.222678   test-rmse:11.160549+0.537110 
## [152]    train-rmse:10.847249+0.219519   test-rmse:11.137853+0.532324 
## [153]    train-rmse:10.814964+0.208642   test-rmse:11.107298+0.512058 
## [154]    train-rmse:10.791077+0.200110   test-rmse:11.085474+0.508884 
## [155]    train-rmse:10.764873+0.210425   test-rmse:11.058019+0.500876 
## [156]    train-rmse:10.738742+0.214985   test-rmse:11.031225+0.494808 
## [157]    train-rmse:10.706057+0.204146   test-rmse:10.998783+0.477529 
## [158]    train-rmse:10.677813+0.209446   test-rmse:10.972833+0.474366 
## [159]    train-rmse:10.656374+0.212735   test-rmse:10.952221+0.474602 
## [160]    train-rmse:10.613637+0.203139   test-rmse:10.911608+0.459421 
## [161]    train-rmse:10.598676+0.196652   test-rmse:10.897143+0.456877 
## [162]    train-rmse:10.565520+0.187652   test-rmse:10.863735+0.452187 
## [163]    train-rmse:10.532136+0.171342   test-rmse:10.834299+0.448851 
## [164]    train-rmse:10.515833+0.171246   test-rmse:10.817336+0.448886 
## [165]    train-rmse:10.489871+0.168680   test-rmse:10.790682+0.438450 
## [166]    train-rmse:10.463881+0.174288   test-rmse:10.768097+0.430730 
## [167]    train-rmse:10.431808+0.169665   test-rmse:10.737675+0.427054 
## [168]    train-rmse:10.413617+0.168289   test-rmse:10.719041+0.434992 
## [169]    train-rmse:10.400884+0.170443   test-rmse:10.707433+0.435743 
## [170]    train-rmse:10.381628+0.163730   test-rmse:10.689899+0.430353 
## [171]    train-rmse:10.357298+0.156236   test-rmse:10.664782+0.428333 
## [172]    train-rmse:10.340560+0.156245   test-rmse:10.648360+0.429387 
## [173]    train-rmse:10.324111+0.154571   test-rmse:10.633826+0.430920 
## [174]    train-rmse:10.304054+0.159555   test-rmse:10.611082+0.429386 
## [175]    train-rmse:10.292230+0.158318   test-rmse:10.600974+0.426826 
## [176]    train-rmse:10.271419+0.153204   test-rmse:10.580396+0.423563 
## [177]    train-rmse:10.250217+0.144801   test-rmse:10.560666+0.425360 
## [178]    train-rmse:10.236630+0.150307   test-rmse:10.546858+0.428558 
## [179]    train-rmse:10.219581+0.151138   test-rmse:10.530361+0.429606 
## [180]    train-rmse:10.203279+0.159297   test-rmse:10.516089+0.436302 
## [181]    train-rmse:10.170474+0.150174   test-rmse:10.484254+0.432712 
## [182]    train-rmse:10.150607+0.143100   test-rmse:10.466180+0.432907 
## [183]    train-rmse:10.128919+0.139769   test-rmse:10.447276+0.430366 
## [184]    train-rmse:10.116444+0.136392   test-rmse:10.434183+0.428852 
## [185]    train-rmse:10.103238+0.137503   test-rmse:10.421408+0.425913 
## [186]    train-rmse:10.079543+0.141208   test-rmse:10.398535+0.423641 
## [187]    train-rmse:10.061515+0.145919   test-rmse:10.380218+0.420702 
## [188]    train-rmse:10.043661+0.140927   test-rmse:10.364025+0.420200 
## [189]    train-rmse:10.030702+0.136669   test-rmse:10.351242+0.418083 
## [190]    train-rmse:10.013099+0.132077   test-rmse:10.334806+0.417904 
## [191]    train-rmse:10.003224+0.131046   test-rmse:10.325222+0.417186 
## [192]    train-rmse:9.977794+0.133473    test-rmse:10.303993+0.409037 
## [193]    train-rmse:9.955110+0.127801    test-rmse:10.281610+0.414874 
## [194]    train-rmse:9.937767+0.123408    test-rmse:10.264488+0.410173 
## [195]    train-rmse:9.920471+0.119351    test-rmse:10.248803+0.402870 
## [196]    train-rmse:9.904807+0.119572    test-rmse:10.232660+0.404553 
## [197]    train-rmse:9.894151+0.118215    test-rmse:10.223174+0.406781 
## [198]    train-rmse:9.877639+0.119703    test-rmse:10.207268+0.411335 
## [199]    train-rmse:9.860826+0.124701    test-rmse:10.190674+0.413868 
## [200]    train-rmse:9.851560+0.122998    test-rmse:10.181444+0.415317 
## [201]    train-rmse:9.836962+0.122353    test-rmse:10.168148+0.414157 
## [202]    train-rmse:9.821500+0.126547    test-rmse:10.152826+0.413544 
## [203]    train-rmse:9.808001+0.120111    test-rmse:10.139671+0.416198 
## [204]    train-rmse:9.789120+0.120033    test-rmse:10.120445+0.409768 
## [205]    train-rmse:9.770634+0.111066    test-rmse:10.101590+0.413240 
## [206]    train-rmse:9.756057+0.113862    test-rmse:10.087464+0.410996 
## [207]    train-rmse:9.740654+0.108818    test-rmse:10.073008+0.413456 
## [208]    train-rmse:9.729500+0.108962    test-rmse:10.059912+0.405195 
## [209]    train-rmse:9.713300+0.105780    test-rmse:10.045207+0.405790 
## [210]    train-rmse:9.701051+0.106585    test-rmse:10.034742+0.410114 
## [211]    train-rmse:9.682118+0.106183    test-rmse:10.017781+0.404554 
## [212]    train-rmse:9.673854+0.103980    test-rmse:10.010321+0.405675 
## [213]    train-rmse:9.654135+0.100130    test-rmse:9.992298+0.406646 
## [214]    train-rmse:9.640481+0.102486    test-rmse:9.979878+0.404457 
## [215]    train-rmse:9.632062+0.104691    test-rmse:9.971423+0.405219 
## [216]    train-rmse:9.621026+0.101507    test-rmse:9.962105+0.406768 
## [217]    train-rmse:9.608985+0.099291    test-rmse:9.950140+0.404004 
## [218]    train-rmse:9.594721+0.104760    test-rmse:9.936102+0.401648 
## [219]    train-rmse:9.582296+0.103846    test-rmse:9.922834+0.399677 
## [220]    train-rmse:9.567258+0.100433    test-rmse:9.910855+0.402802 
## [221]    train-rmse:9.558158+0.102900    test-rmse:9.902499+0.404595 
## [222]    train-rmse:9.542580+0.107015    test-rmse:9.888357+0.400432 
## [223]    train-rmse:9.523993+0.107321    test-rmse:9.871519+0.388932 
## [224]    train-rmse:9.507012+0.114603    test-rmse:9.856031+0.382138 
## [225]    train-rmse:9.499607+0.113914    test-rmse:9.848538+0.383890 
## [226]    train-rmse:9.487122+0.115743    test-rmse:9.835279+0.381230 
## [227]    train-rmse:9.477153+0.117298    test-rmse:9.824331+0.382217 
## [228]    train-rmse:9.468409+0.117378    test-rmse:9.816470+0.382314 
## [229]    train-rmse:9.456718+0.116577    test-rmse:9.805238+0.383098 
## [230]    train-rmse:9.443708+0.116666    test-rmse:9.792541+0.381910 
## [231]    train-rmse:9.434401+0.122522    test-rmse:9.783843+0.384327 
## [232]    train-rmse:9.424836+0.123293    test-rmse:9.774644+0.381195 
## [233]    train-rmse:9.417189+0.125157    test-rmse:9.766897+0.382082 
## [234]    train-rmse:9.404531+0.128939    test-rmse:9.754988+0.380096 
## [235]    train-rmse:9.385700+0.135677    test-rmse:9.736225+0.384938 
## [236]    train-rmse:9.380681+0.136036    test-rmse:9.731155+0.385226 
## [237]    train-rmse:9.369075+0.131875    test-rmse:9.721336+0.385930 
## [238]    train-rmse:9.356988+0.128752    test-rmse:9.709838+0.389196 
## [239]    train-rmse:9.347853+0.125199    test-rmse:9.701602+0.388253 
## [240]    train-rmse:9.343640+0.124666    test-rmse:9.697530+0.388757 
## [241]    train-rmse:9.332742+0.128993    test-rmse:9.686036+0.388375 
## [242]    train-rmse:9.322995+0.130996    test-rmse:9.676602+0.390244 
## [243]    train-rmse:9.315816+0.131785    test-rmse:9.669880+0.388172 
## [244]    train-rmse:9.305006+0.136571    test-rmse:9.658989+0.391856 
## [245]    train-rmse:9.298165+0.134730    test-rmse:9.652405+0.392567 
## [246]    train-rmse:9.290600+0.134522    test-rmse:9.645570+0.395199 
## [247]    train-rmse:9.281089+0.136824    test-rmse:9.637397+0.396009 
## [248]    train-rmse:9.266041+0.134092    test-rmse:9.624377+0.392818 
## [249]    train-rmse:9.259023+0.134405    test-rmse:9.617109+0.390630 
## [250]    train-rmse:9.242450+0.128554    test-rmse:9.600505+0.390931 
## [251]    train-rmse:9.234533+0.128259    test-rmse:9.593570+0.393001 
## [252]    train-rmse:9.220228+0.126664    test-rmse:9.579589+0.389928 
## [253]    train-rmse:9.212157+0.128353    test-rmse:9.572581+0.386242 
## [254]    train-rmse:9.205359+0.128029    test-rmse:9.566911+0.387810 
## [255]    train-rmse:9.199803+0.128730    test-rmse:9.562516+0.386857 
## [256]    train-rmse:9.192243+0.131254    test-rmse:9.555527+0.386119 
## [257]    train-rmse:9.180939+0.124446    test-rmse:9.545549+0.387471 
## [258]    train-rmse:9.173330+0.127704    test-rmse:9.538570+0.386550 
## [259]    train-rmse:9.165204+0.127079    test-rmse:9.528723+0.379746 
## [260]    train-rmse:9.157089+0.127403    test-rmse:9.521460+0.377256 
## [261]    train-rmse:9.149003+0.128913    test-rmse:9.512727+0.372564 
## [262]    train-rmse:9.136132+0.125149    test-rmse:9.500968+0.377840 
## [263]    train-rmse:9.127902+0.126763    test-rmse:9.493249+0.377592 
## [264]    train-rmse:9.123005+0.126542    test-rmse:9.488505+0.377370 
## [265]    train-rmse:9.117965+0.127221    test-rmse:9.481694+0.370797 
## [266]    train-rmse:9.109887+0.124306    test-rmse:9.473039+0.369011 
## [267]    train-rmse:9.104673+0.123309    test-rmse:9.467989+0.368060 
## [268]    train-rmse:9.092086+0.125318    test-rmse:9.457427+0.360778 
## [269]    train-rmse:9.077100+0.126328    test-rmse:9.443831+0.362886 
## [270]    train-rmse:9.071003+0.126513    test-rmse:9.437746+0.363740 
## [271]    train-rmse:9.064710+0.123078    test-rmse:9.431797+0.362664 
## [272]    train-rmse:9.057125+0.119671    test-rmse:9.425193+0.365882 
## [273]    train-rmse:9.048040+0.119653    test-rmse:9.417839+0.364476 
## [274]    train-rmse:9.038809+0.115257    test-rmse:9.409662+0.365498 
## [275]    train-rmse:9.033037+0.117989    test-rmse:9.404609+0.363848 
## [276]    train-rmse:9.028031+0.116568    test-rmse:9.399984+0.364466 
## [277]    train-rmse:9.020843+0.115073    test-rmse:9.391625+0.359804 
## [278]    train-rmse:9.013157+0.117254    test-rmse:9.384618+0.358877 
## [279]    train-rmse:9.007279+0.118086    test-rmse:9.379630+0.361089 
## [280]    train-rmse:9.002462+0.117524    test-rmse:9.373640+0.358969 
## [281]    train-rmse:8.993344+0.118662    test-rmse:9.365366+0.355980 
## [282]    train-rmse:8.983434+0.119871    test-rmse:9.355912+0.355419 
## [283]    train-rmse:8.977119+0.118818    test-rmse:9.348593+0.354322 
## [284]    train-rmse:8.970545+0.118439    test-rmse:9.342851+0.353287 
## [285]    train-rmse:8.965399+0.118460    test-rmse:9.338003+0.352122 
## [286]    train-rmse:8.956637+0.118442    test-rmse:9.329433+0.348805 
## [287]    train-rmse:8.947091+0.124131    test-rmse:9.319769+0.344848 
## [288]    train-rmse:8.942395+0.124864    test-rmse:9.315539+0.345110 
## [289]    train-rmse:8.936625+0.124930    test-rmse:9.309399+0.342654 
## [290]    train-rmse:8.930326+0.124412    test-rmse:9.303534+0.343498 
## [291]    train-rmse:8.924509+0.123912    test-rmse:9.298911+0.342461 
## [292]    train-rmse:8.918680+0.127292    test-rmse:9.293353+0.339355 
## [293]    train-rmse:8.909154+0.127753    test-rmse:9.284326+0.341114 
## [294]    train-rmse:8.897689+0.124022    test-rmse:9.272688+0.342353 
## [295]    train-rmse:8.891725+0.126154    test-rmse:9.266887+0.344899 
## [296]    train-rmse:8.887268+0.124457    test-rmse:9.262222+0.344258 
## [297]    train-rmse:8.877921+0.124734    test-rmse:9.253354+0.340558 
## [298]    train-rmse:8.868206+0.125806    test-rmse:9.243345+0.334405 
## [299]    train-rmse:8.863985+0.124192    test-rmse:9.238774+0.335266 
## [300]    train-rmse:8.855446+0.122762    test-rmse:9.231752+0.337541 
## [301]    train-rmse:8.848531+0.126742    test-rmse:9.225046+0.339937 
## [302]    train-rmse:8.842111+0.123371    test-rmse:9.220187+0.339654 
## [303]    train-rmse:8.833475+0.122159    test-rmse:9.211904+0.342848 
## [304]    train-rmse:8.824214+0.118498    test-rmse:9.201571+0.345484 
## [305]    train-rmse:8.819610+0.117201    test-rmse:9.197223+0.347339 
## [306]    train-rmse:8.814282+0.115841    test-rmse:9.192321+0.348523 
## [307]    train-rmse:8.806472+0.121544    test-rmse:9.185030+0.347505 
## [308]    train-rmse:8.801235+0.119215    test-rmse:9.181076+0.348473 
## [309]    train-rmse:8.797045+0.118628    test-rmse:9.177992+0.349114 
## [310]    train-rmse:8.792905+0.117712    test-rmse:9.174512+0.349104 
## [311]    train-rmse:8.788173+0.119693    test-rmse:9.169930+0.349259 
## [312]    train-rmse:8.780563+0.119018    test-rmse:9.162828+0.348043 
## [313]    train-rmse:8.769463+0.115788    test-rmse:9.151635+0.360224 
## [314]    train-rmse:8.758913+0.112986    test-rmse:9.141717+0.367362 
## [315]    train-rmse:8.749969+0.111973    test-rmse:9.131876+0.372262 
## [316]    train-rmse:8.742927+0.112606    test-rmse:9.126892+0.370611 
## [317]    train-rmse:8.737209+0.111397    test-rmse:9.121566+0.372515 
## [318]    train-rmse:8.729966+0.108540    test-rmse:9.115212+0.372855 
## [319]    train-rmse:8.727226+0.107770    test-rmse:9.113017+0.373090 
## [320]    train-rmse:8.723138+0.105875    test-rmse:9.109239+0.372929 
## [321]    train-rmse:8.716674+0.101418    test-rmse:9.102372+0.371024 
## [322]    train-rmse:8.713060+0.102855    test-rmse:9.099127+0.369099 
## [323]    train-rmse:8.709337+0.104452    test-rmse:9.096110+0.368832 
## [324]    train-rmse:8.700069+0.100996    test-rmse:9.086214+0.365149 
## [325]    train-rmse:8.689189+0.099130    test-rmse:9.072684+0.366924 
## [326]    train-rmse:8.684540+0.101045    test-rmse:9.067893+0.368995 
## [327]    train-rmse:8.681166+0.102288    test-rmse:9.065271+0.369636 
## [328]    train-rmse:8.675911+0.102319    test-rmse:9.059807+0.371351 
## [329]    train-rmse:8.670249+0.103454    test-rmse:9.053705+0.372845 
## [330]    train-rmse:8.656713+0.101649    test-rmse:9.040555+0.371421 
## [331]    train-rmse:8.651829+0.105010    test-rmse:9.036664+0.370880 
## [332]    train-rmse:8.642512+0.101655    test-rmse:9.028887+0.370997 
## [333]    train-rmse:8.637955+0.101776    test-rmse:9.024683+0.369508 
## [334]    train-rmse:8.631709+0.104609    test-rmse:9.017291+0.363328 
## [335]    train-rmse:8.624763+0.107185    test-rmse:9.009563+0.362530 
## [336]    train-rmse:8.621722+0.107305    test-rmse:9.006493+0.362443 
## [337]    train-rmse:8.614239+0.108263    test-rmse:9.000021+0.362081 
## [338]    train-rmse:8.612366+0.107792    test-rmse:8.998311+0.362194 
## [339]    train-rmse:8.603698+0.105506    test-rmse:8.989673+0.366931 
## [340]    train-rmse:8.594845+0.106247    test-rmse:8.982219+0.364170 
## [341]    train-rmse:8.582654+0.109504    test-rmse:8.970789+0.364589 
## [342]    train-rmse:8.577492+0.109999    test-rmse:8.966339+0.363695 
## [343]    train-rmse:8.568043+0.110500    test-rmse:8.956634+0.363421 
## [344]    train-rmse:8.564476+0.109676    test-rmse:8.954390+0.363080 
## [345]    train-rmse:8.556497+0.107998    test-rmse:8.947262+0.360155 
## [346]    train-rmse:8.549809+0.108851    test-rmse:8.941726+0.358756 
## [347]    train-rmse:8.545305+0.106709    test-rmse:8.937196+0.358693 
## [348]    train-rmse:8.539340+0.110751    test-rmse:8.930907+0.356349 
## [349]    train-rmse:8.535605+0.109778    test-rmse:8.927714+0.358485 
## [350]    train-rmse:8.527952+0.106946    test-rmse:8.920723+0.357695 
## [351]    train-rmse:8.522131+0.108023    test-rmse:8.913847+0.353007 
## [352]    train-rmse:8.514757+0.104509    test-rmse:8.907511+0.350396 
## [353]    train-rmse:8.508233+0.106299    test-rmse:8.898559+0.345045 
## [354]    train-rmse:8.504391+0.105023    test-rmse:8.895302+0.345687 
## [355]    train-rmse:8.502040+0.105116    test-rmse:8.893383+0.346496 
## [356]    train-rmse:8.493789+0.111683    test-rmse:8.886837+0.341131 
## [357]    train-rmse:8.487276+0.109294    test-rmse:8.880791+0.340058 
## [358]    train-rmse:8.484624+0.109270    test-rmse:8.879358+0.340268 
## [359]    train-rmse:8.481388+0.108789    test-rmse:8.875801+0.340265 
## [360]    train-rmse:8.472115+0.113875    test-rmse:8.867973+0.334363 
## [361]    train-rmse:8.467550+0.112128    test-rmse:8.864451+0.334259 
## [362]    train-rmse:8.465397+0.112407    test-rmse:8.863079+0.334470 
## [363]    train-rmse:8.459870+0.111300    test-rmse:8.857651+0.339148 
## [364]    train-rmse:8.454519+0.109234    test-rmse:8.852782+0.337630 
## [365]    train-rmse:8.449811+0.111535    test-rmse:8.848550+0.335505 
## [366]    train-rmse:8.441154+0.117538    test-rmse:8.839529+0.336672 
## [367]    train-rmse:8.437111+0.118444    test-rmse:8.835668+0.335652 
## [368]    train-rmse:8.430080+0.114298    test-rmse:8.829018+0.335797 
## [369]    train-rmse:8.425472+0.112264    test-rmse:8.825060+0.337070 
## [370]    train-rmse:8.419826+0.108449    test-rmse:8.819821+0.336932 
## [371]    train-rmse:8.414648+0.106507    test-rmse:8.815369+0.336799 
## [372]    train-rmse:8.410266+0.108397    test-rmse:8.811965+0.336446 
## [373]    train-rmse:8.404494+0.110066    test-rmse:8.806865+0.335359 
## [374]    train-rmse:8.398760+0.111229    test-rmse:8.802126+0.332824 
## [375]    train-rmse:8.394882+0.113068    test-rmse:8.797074+0.328081 
## [376]    train-rmse:8.388706+0.111188    test-rmse:8.791768+0.328663 
## [377]    train-rmse:8.380818+0.106269    test-rmse:8.784194+0.331310 
## [378]    train-rmse:8.376018+0.106524    test-rmse:8.780105+0.329733 
## [379]    train-rmse:8.372654+0.106806    test-rmse:8.777394+0.330116 
## [380]    train-rmse:8.365086+0.108462    test-rmse:8.769355+0.329771 
## [381]    train-rmse:8.360022+0.107402    test-rmse:8.765104+0.332241 
## [382]    train-rmse:8.356217+0.109956    test-rmse:8.761511+0.330954 
## [383]    train-rmse:8.351346+0.110337    test-rmse:8.757184+0.330023 
## [384]    train-rmse:8.348650+0.110514    test-rmse:8.754745+0.328900 
## [385]    train-rmse:8.345124+0.111960    test-rmse:8.751946+0.328842 
## [386]    train-rmse:8.341696+0.111095    test-rmse:8.749252+0.327808 
## [387]    train-rmse:8.333018+0.107681    test-rmse:8.742530+0.326593 
## [388]    train-rmse:8.326462+0.114535    test-rmse:8.737281+0.321928 
## [389]    train-rmse:8.321180+0.113817    test-rmse:8.732704+0.321114 
## [390]    train-rmse:8.314883+0.111216    test-rmse:8.724883+0.317757 
## [391]    train-rmse:8.311658+0.109820    test-rmse:8.722228+0.317511 
## [392]    train-rmse:8.307268+0.109485    test-rmse:8.718089+0.317679 
## [393]    train-rmse:8.299887+0.106847    test-rmse:8.711557+0.321601 
## [394]    train-rmse:8.296377+0.108840    test-rmse:8.708518+0.319477 
## [395]    train-rmse:8.284252+0.101986    test-rmse:8.696319+0.318920 
## [396]    train-rmse:8.277013+0.098498    test-rmse:8.689118+0.317924 
## [397]    train-rmse:8.271712+0.100596    test-rmse:8.684471+0.317809 
## [398]    train-rmse:8.266528+0.098587    test-rmse:8.678899+0.316782 
## [399]    train-rmse:8.262553+0.097379    test-rmse:8.674878+0.315652 
## [400]    train-rmse:8.255024+0.104937    test-rmse:8.668643+0.311825 
## [401]    train-rmse:8.250928+0.105900    test-rmse:8.665632+0.310449 
## [402]    train-rmse:8.246820+0.104919    test-rmse:8.661500+0.311074 
## [403]    train-rmse:8.240763+0.102843    test-rmse:8.656542+0.316113 
## [404]    train-rmse:8.236864+0.103735    test-rmse:8.652959+0.314765 
## [405]    train-rmse:8.234394+0.104069    test-rmse:8.651131+0.314362 
## [406]    train-rmse:8.231032+0.104143    test-rmse:8.648427+0.313337 
## [407]    train-rmse:8.223114+0.104787    test-rmse:8.642043+0.317244 
## [408]    train-rmse:8.215830+0.103464    test-rmse:8.635102+0.315294 
## [409]    train-rmse:8.211355+0.103566    test-rmse:8.630602+0.318304 
## [410]    train-rmse:8.208338+0.104152    test-rmse:8.627650+0.319815 
## [411]    train-rmse:8.200743+0.105829    test-rmse:8.620149+0.316057 
## [412]    train-rmse:8.194809+0.105608    test-rmse:8.613854+0.315495 
## [413]    train-rmse:8.189666+0.107870    test-rmse:8.607138+0.311749 
## [414]    train-rmse:8.185607+0.109902    test-rmse:8.603459+0.310936 
## [415]    train-rmse:8.182045+0.110686    test-rmse:8.600648+0.310870 
## [416]    train-rmse:8.179227+0.109856    test-rmse:8.598332+0.309993 
## [417]    train-rmse:8.174266+0.109494    test-rmse:8.592953+0.308735 
## [418]    train-rmse:8.167821+0.108815    test-rmse:8.587512+0.309684 
## [419]    train-rmse:8.161695+0.107251    test-rmse:8.581133+0.308699 
## [420]    train-rmse:8.157865+0.106188    test-rmse:8.577647+0.307995 
## [421]    train-rmse:8.154206+0.105058    test-rmse:8.574034+0.307711 
## [422]    train-rmse:8.152153+0.104021    test-rmse:8.572143+0.309135 
## [423]    train-rmse:8.148262+0.103377    test-rmse:8.569415+0.308855 
## [424]    train-rmse:8.143459+0.100853    test-rmse:8.565144+0.311188 
## [425]    train-rmse:8.139362+0.100260    test-rmse:8.561183+0.312183 
## [426]    train-rmse:8.137227+0.099973    test-rmse:8.559392+0.312494 
## [427]    train-rmse:8.134416+0.098866    test-rmse:8.557185+0.313116 
## [428]    train-rmse:8.130422+0.101029    test-rmse:8.551822+0.308770 
## [429]    train-rmse:8.125710+0.100204    test-rmse:8.548025+0.307216 
## [430]    train-rmse:8.122021+0.100741    test-rmse:8.543457+0.304219 
## [431]    train-rmse:8.118519+0.102938    test-rmse:8.538539+0.301268 
## [432]    train-rmse:8.115112+0.105820    test-rmse:8.534198+0.298014 
## [433]    train-rmse:8.109576+0.106966    test-rmse:8.529557+0.297208 
## [434]    train-rmse:8.104453+0.105182    test-rmse:8.525045+0.300921 
## [435]    train-rmse:8.102312+0.104614    test-rmse:8.523535+0.300780 
## [436]    train-rmse:8.099404+0.105157    test-rmse:8.521761+0.298555 
## [437]    train-rmse:8.097651+0.105387    test-rmse:8.519973+0.298603 
## [438]    train-rmse:8.092798+0.107474    test-rmse:8.515911+0.298282 
## [439]    train-rmse:8.087656+0.105214    test-rmse:8.511749+0.297680 
## [440]    train-rmse:8.083394+0.103154    test-rmse:8.508649+0.298580 
## [441]    train-rmse:8.080258+0.103120    test-rmse:8.505164+0.297517 
## [442]    train-rmse:8.076874+0.105540    test-rmse:8.502965+0.295661 
## [443]    train-rmse:8.073629+0.106265    test-rmse:8.500793+0.295514 
## [444]    train-rmse:8.069715+0.106382    test-rmse:8.497350+0.297009 
## [445]    train-rmse:8.066160+0.106491    test-rmse:8.494838+0.295578 
## [446]    train-rmse:8.063038+0.106727    test-rmse:8.492668+0.295896 
## [447]    train-rmse:8.058164+0.107094    test-rmse:8.487546+0.296164 
## [448]    train-rmse:8.054766+0.108186    test-rmse:8.485156+0.295207 
## [449]    train-rmse:8.052449+0.108331    test-rmse:8.483192+0.294671 
## [450]    train-rmse:8.048032+0.108651    test-rmse:8.479274+0.295044 
## [451]    train-rmse:8.045126+0.109984    test-rmse:8.477350+0.294835 
## [452]    train-rmse:8.038511+0.112459    test-rmse:8.470843+0.295767 
## [453]    train-rmse:8.033699+0.110321    test-rmse:8.465756+0.294978 
## [454]    train-rmse:8.027253+0.108786    test-rmse:8.459952+0.294763 
## [455]    train-rmse:8.023795+0.107541    test-rmse:8.455992+0.294531 
## [456]    train-rmse:8.020175+0.109345    test-rmse:8.453409+0.293265 
## [457]    train-rmse:8.014451+0.112010    test-rmse:8.447573+0.292246 
## [458]    train-rmse:8.012392+0.111272    test-rmse:8.446089+0.291898 
## [459]    train-rmse:8.008706+0.111259    test-rmse:8.443013+0.291223 
## [460]    train-rmse:8.005024+0.112409    test-rmse:8.440601+0.291097 
## [461]    train-rmse:8.001924+0.112444    test-rmse:8.437531+0.291513 
## [462]    train-rmse:7.998080+0.113186    test-rmse:8.432672+0.289153 
## [463]    train-rmse:7.994006+0.113843    test-rmse:8.429280+0.289369 
## [464]    train-rmse:7.987688+0.113642    test-rmse:8.424036+0.291415 
## [465]    train-rmse:7.984455+0.112733    test-rmse:8.420100+0.289553 
## [466]    train-rmse:7.979910+0.112689    test-rmse:8.416054+0.290919 
## [467]    train-rmse:7.976199+0.110366    test-rmse:8.412339+0.289246 
## [468]    train-rmse:7.973880+0.110837    test-rmse:8.410325+0.289415 
## [469]    train-rmse:7.970769+0.111544    test-rmse:8.407548+0.289310 
## [470]    train-rmse:7.967203+0.111272    test-rmse:8.404503+0.288580 
## [471]    train-rmse:7.963899+0.110897    test-rmse:8.402371+0.288409 
## [472]    train-rmse:7.960302+0.110280    test-rmse:8.399463+0.290735 
## [473]    train-rmse:7.953855+0.111236    test-rmse:8.393564+0.289393 
## [474]    train-rmse:7.949619+0.113152    test-rmse:8.390611+0.288573 
## [475]    train-rmse:7.944849+0.111194    test-rmse:8.385507+0.286848 
## [476]    train-rmse:7.941335+0.110653    test-rmse:8.383277+0.288130 
## [477]    train-rmse:7.938676+0.111717    test-rmse:8.381802+0.286679 
## [478]    train-rmse:7.934691+0.114084    test-rmse:8.379222+0.285194 
## [479]    train-rmse:7.932671+0.113550    test-rmse:8.377414+0.285567 
## [480]    train-rmse:7.930535+0.114241    test-rmse:8.376175+0.285055 
## [481]    train-rmse:7.926017+0.114892    test-rmse:8.370511+0.281872 
## [482]    train-rmse:7.920363+0.116034    test-rmse:8.365403+0.281006 
## [483]    train-rmse:7.916318+0.116981    test-rmse:8.361843+0.282089 
## [484]    train-rmse:7.911390+0.115963    test-rmse:8.358424+0.284879 
## [485]    train-rmse:7.909783+0.115574    test-rmse:8.357141+0.285639 
## [486]    train-rmse:7.905772+0.113984    test-rmse:8.352969+0.287693 
## [487]    train-rmse:7.903043+0.114656    test-rmse:8.350924+0.287935 
## [488]    train-rmse:7.898748+0.115707    test-rmse:8.347681+0.286642 
## [489]    train-rmse:7.895827+0.115516    test-rmse:8.344660+0.286553 
## [490]    train-rmse:7.894121+0.116428    test-rmse:8.343343+0.286141 
## [491]    train-rmse:7.890137+0.118835    test-rmse:8.340771+0.285379 
## [492]    train-rmse:7.884099+0.119281    test-rmse:8.335552+0.285053 
## [493]    train-rmse:7.879485+0.119768    test-rmse:8.331183+0.286299 
## [494]    train-rmse:7.876964+0.119746    test-rmse:8.328944+0.287170 
## [495]    train-rmse:7.872779+0.117897    test-rmse:8.324450+0.291334 
## [496]    train-rmse:7.869536+0.117146    test-rmse:8.321783+0.291531 
## [497]    train-rmse:7.866731+0.118037    test-rmse:8.319353+0.290080 
## [498]    train-rmse:7.865417+0.117851    test-rmse:8.318216+0.289873 
## [499]    train-rmse:7.856918+0.115144    test-rmse:8.309280+0.285302 
## [500]    train-rmse:7.853476+0.115487    test-rmse:8.306489+0.284709
best_nrounds <- cv_results$best_iteration
# Train the final model using the best number of rounds found
model_xgb <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb, dtrain)
test_pred <- predict(model_xgb, dtest)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)

# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)

train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
cat("Model Performance Metrics:\n",
    "--------------------------\n",
    "Training RMSE: ", train_rmse, "\n",
    "Test RMSE: ", test_rmse, "\n",
    "Training R-squared: ", r_squared_train, "\n",
    "Test R-squared: ", r_squared_test, "\n",
    "Training MAE: ", train_mae, "\n",
    "Test MAE: ", test_mae, "\n",
    "Training MAPE: ", train_mape, "%\n",
    "Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 7.933109
## Test RMSE: 8.062598
## Training R-squared: 0.9824703
## Test R-squared: 0.9816268
## Training MAE: 4.424568
## Test MAE: 4.50928
## Training MAPE: 28.69924%
## Test MAPE: 29.15366%
# Correcting Residuals Data Frame
# Assuming 'train_labels' and 'test_labels' contain the actual values,
# and 'train_pred' and 'test_pred' contain your model's predictions:

residuals_train <- train_labels - train_pred
residuals_test <- test_labels - test_pred

residuals_data <- data.frame(
  Residuals = c(residuals_train, residuals_test),
  Dataset = c(rep('Training', length(residuals_train)), rep('Test', length(residuals_test)))
)

# Now plotting residuals with corrected data
ggplot(residuals_data, aes(x = Residuals, fill = Dataset)) +
  geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
  facet_wrap(~ Dataset) +
  ggtitle('Residuals Distribution')

# Assuming train_labels, test_labels, train_pred, and test_pred are correctly defined

# Adjusted Actual vs. Predicted Data Preparation
actual_pred_data <- data.frame(
  Actual = c(train_labels, test_labels),
  Predicted = c(train_pred, test_pred),
  Dataset = c(rep('Training', length(train_labels)), rep('Test', length(test_labels)))
)

# Plotting Actual vs. Predicted Values
ggplot(actual_pred_data, aes(x = Actual, y = Predicted, colour = Dataset)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'red') +
  xlab('Actual Values') +
  ylab('Predicted Values') +
  scale_colour_manual(values = c('Training' = 'blue', 'Test' = 'red')) +
  ggtitle('Actual vs. Predicted Values')

library(xgboost)

# Calculate feature importance
importance_matrix <- xgb.importance(feature_names = colnames(train_features), model = model_xgb)

# View the feature importance scores
print(importance_matrix)
##                                Feature         Gain        Cover    Frequency
##                                 <char>        <num>        <num>        <num>
##  1:                       DOLLAR_SALES 7.250497e-01 3.190890e-01 0.2587792642
##  2:                  12SMALL 12ONE CUP 1.111782e-01 7.959344e-02 0.0592251951
##  3:                  12SMALL 24ONE CUP 4.425131e-02 5.997315e-02 0.0386008919
##  4:                 WEEKS_SINCE_LAUNCH 2.499099e-02 1.157388e-01 0.1399108138
##  5:                  20SMALL MULTI JUG 1.713063e-02 3.603165e-02 0.0381828317
##  6:                           POP_SQMI 8.908652e-03 5.965322e-02 0.1082775920
##  7:           12SMALL MLT PLASTICS JUG 8.846117e-03 1.978074e-02 0.0202062430
##  8:                     REGION_ARIZONA 7.742566e-03 9.364009e-03 0.0257803790
##  9:       ITEM_BEAUTIFUL GREENER  PLUM 6.940521e-03 2.274741e-02 0.0274526198
## 10:                  12SMALL 18ONE CUP 6.188074e-03 3.062063e-02 0.0218784838
## 11:        ITEM_RAINING  THRASHED PLUM 5.649968e-03 1.197710e-02 0.0209030100
## 12:                       REGION_SOCAL 5.218939e-03 8.922647e-03 0.0197881828
## 13:                       .5L 6ONE JUG 3.726497e-03 1.397857e-02 0.0119843924
## 14:            BRAND_BEAUTIFUL GREENER 3.409390e-03 8.407059e-03 0.0096153846
## 15:                    CALORIC_SEGMENT 3.301499e-03 7.090121e-03 0.0089186176
## 16:                    REGION_MOUNTAIN 2.358481e-03 1.632401e-02 0.0091973244
## 17:                 ITEM_ZIZZLES  PLUM 2.354024e-03 8.482146e-03 0.0083612040
## 18:                  12SMALL 20ONE CUP 2.275258e-03 2.722203e-02 0.0137959866
## 19:                       WEEK_OF_YEAR 2.089544e-03 4.534149e-02 0.0728818283
## 20:                 BRAND_SINGLE GROUP 1.860115e-03 3.376159e-03 0.0036231884
## 21:                ITEM_ SMASH  SUNSET 1.568908e-03 3.105690e-03 0.0032051282
## 22:                      REGION_KANSAS 9.756360e-04 2.029609e-02 0.0126811594
## 23:                    REGION_COLORADO 8.989604e-04 7.888115e-03 0.0100334448
## 24:                 REGION_CALI_NEVADA 7.861451e-04 9.748476e-03 0.0048773690
## 25:         12SMALL 24ONE PLASTICS JUG 6.618125e-04 2.513283e-02 0.0089186176
## 26: BRAND_HILL MOISTURE THRASHED APPLE 5.271685e-04 3.925149e-03 0.0066889632
## 27:                    REGION_NORTHERN 4.438187e-04 4.679586e-03 0.0085005574
## 28:                   REGION_DESERT_SW 4.156559e-04 1.583096e-02 0.0147714604
## 29:                   BRAND_DIET SMASH 1.136502e-04 1.033363e-03 0.0018115942
## 30:                       REGION_NOCAL 6.921370e-05 1.233931e-03 0.0062709030
## 31:                   REGION_NEWMEXICO 4.339678e-05 4.059228e-05 0.0018115942
## 32:                     REGION_PRAIRIE 2.111430e-05 3.943581e-04 0.0018115942
## 33:                   12SMALL 6ONE CUP 3.414952e-06 2.924409e-03 0.0009754738
## 34:                       ITEM_GO-DAY  3.013295e-07 2.260741e-05 0.0001393534
## 35:                       BRAND_GO-DAY 2.776568e-07 3.049391e-05 0.0001393534
##                                Feature         Gain        Cover    Frequency
# Plot the feature importance
xgb.plot.importance(importance_matrix = importance_matrix)

# Compute partial dependence data for 'DOLLAR_SALES' and 'plum', CALORIC_SEGMENT, and "ENERGY
# pd <- partial(model_xgb, pred.var = c("DOLLAR_SALES", "plum", "CALORIC_SEGMENT", ENERGY"), train = train_features, grid.resolution = 20)
# 
# # Default PDP
# pdp1 <- plotPartial(pd, plot = TRUE)
# 
# # Add contour lines and use a different color palette
# rwb <- colorRampPalette(c("red", "white", "blue"))
# pdp2 <- plotPartial(pd, contour = TRUE, col.regions = rwb)
# 
# # 3-D surface
# pdp3 <- plotPartial(pd, levelplot = FALSE, zlab = "Predicted Outcome", drape = TRUE, colorkey = TRUE, screen = list(z = -20, x = -60))
# 
# # Combine plots into one window
# grid.arrange(pdp1, pdp2, pdp3, ncol = 3)

XGBOOST Model #2

Model with NO DOLLAR SALES Variable

# Assuming 'df' is your complete dataframe and 'UNIT_SALES' is your target variable
df2 <- df
# Remove DOLLAR_SALES from the features
df2$DOLLAR_SALES <- NULL

# Split the updated data into training and testing sets (assuming you're using a similar approach as before)
set.seed(123)
df2_testtrn <- initial_split(df2, prop = 0.8, strata = UNIT_SALES)
Train <- training(df2_testtrn)
Test <- testing(df2_testtrn)

# Prepare features and labels for XGBoost, excluding DOLLAR_SALES
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES

# Convert data to DMatrix format for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(train_features), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_features), label = test_labels)
# Assuming 'params' and 'best_nrounds' are defined as before

# Train the final model without DOLLAR_SALES
model_xgb_no_dollar_sales <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb_no_dollar_sales, dtrain)
test_pred <- predict(model_xgb_no_dollar_sales, dtest)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)

# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)

train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
# Correcting Residuals Data Frame
# Assuming 'train_labels' and 'test_labels' contain the actual values,
# and 'train_pred' and 'test_pred' contain your model's predictions:

residuals_train <- train_labels - train_pred
residuals_test <- test_labels - test_pred

residuals_data <- data.frame(
  Residuals = c(residuals_train, residuals_test),
  Dataset = c(rep('Training', length(residuals_train)), rep('Test', length(residuals_test)))
)

# Now plotting residuals with corrected data
ggplot(residuals_data, aes(x = Residuals, fill = Dataset)) +
  geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
  facet_wrap(~ Dataset) +
  ggtitle('Residuals Distribution')

# Assuming train_labels, test_labels, train_pred, and test_pred are correctly defined

# Adjusted Actual vs. Predicted Data Preparation
actual_pred_data <- data.frame(
  Actual = c(train_labels, test_labels),
  Predicted = c(train_pred, test_pred),
  Dataset = c(rep('Training', length(train_labels)), rep('Test', length(test_labels)))
)

# Plotting Actual vs. Predicted Values
ggplot(actual_pred_data, aes(x = Actual, y = Predicted, colour = Dataset)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'red') +
  xlab('Actual Values') +
  ylab('Predicted Values') +
  scale_colour_manual(values = c('Training' = 'blue', 'Test' = 'red')) +
  ggtitle('Actual vs. Predicted Values')

cat("Model Performance Metrics:\n",
    "--------------------------\n",
    "Training RMSE: ", train_rmse, "\n",
    "Test RMSE: ", test_rmse, "\n",
    "Training R-squared: ", r_squared_train, "\n",
    "Test R-squared: ", r_squared_test, "\n",
    "Training MAE: ", train_mae, "\n",
    "Test MAE: ", test_mae, "\n",
    "Training MAPE: ", train_mape, "%\n",
    "Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 41.82988
## Test RMSE: 41.64189
## Training R-squared: 0.5126277
## Test R-squared: 0.5098883
## Training MAE: 26.72833
## Test MAE: 26.70301
## Training MAPE: 245.5749%
## Test MAPE: 242.3915%
# Calculate feature importance
importance_matrix2 <- xgb.importance(feature_names = colnames(train_features), model = model_xgb_no_dollar_sales)

# View the feature importance scores
print(importance_matrix2)
##                                Feature         Gain        Cover    Frequency
##                                 <char>        <num>        <num>        <num>
##  1:                           POP_SQMI 3.216350e-01 3.099819e-01 0.3096774194
##  2:       ITEM_BEAUTIFUL GREENER  PLUM 1.074171e-01 3.090875e-02 0.0438709677
##  3:                 WEEKS_SINCE_LAUNCH 6.342494e-02 6.155881e-02 0.1195698925
##  4:                       REGION_SOCAL 5.763006e-02 3.082725e-02 0.0417204301
##  5:        ITEM_RAINING  THRASHED PLUM 4.364411e-02 9.919278e-03 0.0225089606
##  6:                  12SMALL 12ONE CUP 4.290751e-02 1.348118e-02 0.0210752688
##  7:            BRAND_BEAUTIFUL GREENER 4.080563e-02 1.230483e-02 0.0203584229
##  8:                  20SMALL MULTI JUG 3.799602e-02 2.140640e-02 0.0278136201
##  9:                     REGION_ARIZONA 3.096537e-02 1.168918e-02 0.0365591398
## 10:                    REGION_MOUNTAIN 3.034199e-02 7.063550e-03 0.0127598566
## 11:           12SMALL MLT PLASTICS JUG 2.540258e-02 6.411313e-02 0.0255197133
## 12:                       .5L 6ONE JUG 2.513305e-02 6.938326e-02 0.0210752688
## 13:                    REGION_NORTHERN 1.905685e-02 4.979571e-03 0.0090322581
## 14:                  12SMALL 18ONE CUP 1.496511e-02 5.381762e-02 0.0170609319
## 15:                 ITEM_ZIZZLES  PLUM 1.410018e-02 1.020220e-02 0.0067383513
## 16:                      REGION_KANSAS 1.285587e-02 2.211024e-02 0.0117562724
## 17:                       WEEK_OF_YEAR 1.082639e-02 4.355395e-02 0.0771326165
## 18:                  12SMALL 24ONE CUP 1.007761e-02 4.561881e-02 0.0183512545
## 19:                 REGION_CALI_NEVADA 9.864727e-03 1.202192e-02 0.0127598566
## 20: BRAND_HILL MOISTURE THRASHED APPLE 8.640680e-03 3.853981e-03 0.0087455197
## 21:                       REGION_NOCAL 8.421951e-03 1.689332e-02 0.0364157706
## 22:                    REGION_COLORADO 8.022730e-03 3.949575e-03 0.0149103943
## 23:                    CALORIC_SEGMENT 7.846218e-03 2.547401e-03 0.0068817204
## 24:                ITEM_ SMASH  SUNSET 7.362977e-03 2.615727e-03 0.0043010753
## 25:                  12SMALL 20ONE CUP 7.111921e-03 3.886452e-02 0.0124731183
## 26:                 BRAND_SINGLE GROUP 5.698619e-03 4.076979e-03 0.0028673835
## 27:         12SMALL 24ONE PLASTICS JUG 5.310337e-03 3.481718e-02 0.0106093190
## 28:                   REGION_NEWMEXICO 4.566905e-03 5.244769e-03 0.0063082437
## 29:                   12SMALL 6ONE CUP 4.553726e-03 2.236744e-02 0.0067383513
## 30:                     REGION_PRAIRIE 4.033372e-03 7.846924e-03 0.0087455197
## 31:                   REGION_DESERT_SW 3.336404e-03 1.188687e-03 0.0170609319
## 32:                   BRAND_DIET SMASH 3.194828e-03 7.460710e-04 0.0014336918
## 33:                       ITEM_GO-DAY  1.762084e-03 8.690429e-03 0.0031541219
## 34:                       BRAND_GO-DAY 5.654087e-04 4.401173e-03 0.0012903226
## 35:                    BRAND_FANTASMIC 4.786928e-04 6.941731e-03 0.0021505376
## 36: ITEM_FANTASMIC  CUSTARD APPLE PLUM 4.307299e-05 1.227971e-05 0.0005734767
##                                Feature         Gain        Cover    Frequency
xgb.plot.importance(importance_matrix = importance_matrix2)

if (!requireNamespace("pdp", quietly = TRUE)) install.packages("pdp")
if (!requireNamespace("xgboost", quietly = TRUE)) install.packages("xgboost")
library(pdp)
## 
## Attaching package: 'pdp'
## The following object is masked from 'package:purrr':
## 
##     partial
library(xgboost)
pdp::partial(model_xgb_no_dollar_sales, pred.var = "WEEK_OF_YEAR", train = train_features)
##    WEEK_OF_YEAR     yhat
## 1          1.00 42.33142
## 2          2.04 42.13614
## 3          3.08 42.09447
## 4          4.12 41.77467
## 5          5.16 41.66057
## 6          6.20 42.74848
## 7          7.24 42.75283
## 8          8.28 42.74383
## 9          9.32 42.74452
## 10        10.36 42.90974
## 11        11.40 42.98448
## 12        12.44 42.97560
## 13        13.48 45.00818
## 14        14.52 45.28822
## 15        15.56 45.22283
## 16        16.60 45.18938
## 17        17.64 45.19483
## 18        18.68 45.19402
## 19        19.72 46.63148
## 20        20.76 46.62565
## 21        21.80 46.84449
## 22        22.84 46.85041
## 23        23.88 46.84909
## 24        24.92 46.87301
## 25        25.96 46.88228
## 26        27.00 46.88350
## 27        28.04 46.87881
## 28        29.08 46.88468
## 29        30.12 46.88782
## 30        31.16 46.89002
## 31        32.20 46.93946
## 32        33.24 46.93441
## 33        34.28 46.91926
## 34        35.32 46.92555
## 35        36.36 49.52571
## 36        37.40 49.54256
## 37        38.44 49.17260
## 38        39.48 49.16802
## 39        40.52 49.18255
## 40        41.56 48.74465
## 41        42.60 48.30548
## 42        43.64 48.19900
## 43        44.68 48.13644
## 44        45.72 48.25533
## 45        46.76 48.24118
## 46        47.80 50.63079
## 47        48.84 43.79528
## 48        49.88 43.79441
## 49        50.92 44.78383
## 50        51.96 50.41952
## 51        53.00 47.26535
pd <- partial(model_xgb_no_dollar_sales, pred.var = "WEEK_OF_YEAR", train = train_features, grid.resolution = 20)

# Default PDP
pdp1 <- plotPartial(pd, plot = TRUE)



# plot
grid.arrange(pdp1)

Based on the plum Energy Drink innovation datafram we expect the best 13 weeks to be between about weeks 33 and weeks 46.

# Set up
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, skimr, knitr, caret, readr, 
               ggplot2, dplyr, tidymodels, pROC, xgboost, doParallel, vip, DALEXtra, pdp, DALEX, gridExtra)
plum <- read_csv("plum_tableau.csv")
## Rows: 108886 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): CATEGORY, MANUFACTURER, BRAND, PACKAGE, ITEM, REGION, SEASON, PACK...
## dbl  (8): MARKET_KEY, CALORIC_SEGMENT, UNIT_SALES, DOLLAR_SALES, POP_SQMI, M...
## date (2): DATE, min_launch_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(plum)
## spc_tbl_ [108,886 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ MARKET_KEY          : num [1:108886] 1 1 1 1 1 1 1 1 1 1 ...
##  $ DATE                : Date[1:108886], format: "2022-06-18" "2022-04-30" ...
##  $ CALORIC_SEGMENT     : num [1:108886] 1 1 1 1 1 1 1 1 1 1 ...
##  $ CATEGORY            : chr [1:108886] "SSD" "SSD" "SSD" "SSD" ...
##  $ UNIT_SALES          : num [1:108886] 1 14 18 13 19 4 29 35 75 25 ...
##  $ DOLLAR_SALES        : num [1:108886] 4.62 86.86 89.73 65.6 72.93 ...
##  $ MANUFACTURER        : chr [1:108886] "JOLLYS" "JOLLYS" "JOLLYS" "JOLLYS" ...
##  $ BRAND               : chr [1:108886] "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" ...
##  $ PACKAGE             : chr [1:108886] "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" ...
##  $ ITEM                : chr [1:108886] "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" ...
##  $ POP_SQMI            : num [1:108886] 1.2 1.2 1.2 1.2 1.2 ...
##  $ REGION              : chr [1:108886] "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
##  $ MONTH               : num [1:108886] 6 4 12 7 11 6 1 11 10 12 ...
##  $ SEASON              : chr [1:108886] "SUMMER" "SPRING" "WINTER" "SUMMER" ...
##  $ PACKAGE2            : chr [1:108886] "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
##  $ ENERGY_DRINK        : num [1:108886] 0 0 0 0 0 0 0 0 0 0 ...
##  $ CALORIC_SEGMENT_TEXT: chr [1:108886] NA NA NA NA ...
##  $ min_launch_date     : Date[1:108886], format: "2021-09-04" "2021-09-04" ...
##  $ WEEKS_SINCE_LAUNCH  : num [1:108886] 41 34 14 47 12 40 20 9 6 13 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   MARKET_KEY = col_double(),
##   ..   DATE = col_date(format = ""),
##   ..   CALORIC_SEGMENT = col_double(),
##   ..   CATEGORY = col_character(),
##   ..   UNIT_SALES = col_double(),
##   ..   DOLLAR_SALES = col_double(),
##   ..   MANUFACTURER = col_character(),
##   ..   BRAND = col_character(),
##   ..   PACKAGE = col_character(),
##   ..   ITEM = col_character(),
##   ..   POP_SQMI = col_double(),
##   ..   REGION = col_character(),
##   ..   MONTH = col_double(),
##   ..   SEASON = col_character(),
##   ..   PACKAGE2 = col_character(),
##   ..   ENERGY_DRINK = col_double(),
##   ..   CALORIC_SEGMENT_TEXT = col_character(),
##   ..   min_launch_date = col_date(format = ""),
##   ..   WEEKS_SINCE_LAUNCH = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
#remove any rows that have PACKAGE containing JUG 
plum <- plum %>% filter(!str_detect(PACKAGE, "JUG"))

#remove any rows that have PACKAGE containing JUG 
plum <- plum %>% filter(!str_detect(PACKAGE2, "JUG"))

#remove any rows containing SMASH SUNSET
plum <- plum %>% filter(!str_detect(ITEM, "SMASH"))

# #remove any rows containing FANTASMIC
# plum <- plum %>% filter(!str_detect(ITEM, "FANTASMIC"))
# 
# #remove any rows containing GO DAY
# df <- df %>% filter(!str_detect(ITEM, "GO DAY"))

print(unique(df$ITEM))
## NULL
print(unique(df$PACKAGE))
## NULL
# Assuming you have already loaded your data into plum dataframe

# Remove the MARKET_KEY column
plum <- plum[, !names(plum) %in% "MARKET_KEY"]

# Identify categorical variables
categorical_cols <- c( "MANUFACTURER", "BRAND", "PACKAGE", "REGION", "SEASON")

# One-hot encode categorical variables
encoded_data <- dummyVars("~.", data = plum[categorical_cols], fullRank = TRUE)
encoded_data <- predict(encoded_data, newdata = plum[categorical_cols])

# Bind the encoded variables with the original dataframe
plum_encoded <- cbind(plum, encoded_data)

# Remove the original categorical variables
plum_encoded <- plum_encoded[, !(names(plum_encoded) %in% categorical_cols)]

# Now you can proceed with the modeling process
str(plum_encoded)
## 'data.frame':    43366 obs. of  33 variables:
##  $ DATE                             : Date, format: "2022-06-18" "2022-04-30" ...
##  $ CALORIC_SEGMENT                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ CATEGORY                         : chr  "SSD" "SSD" "SSD" "SSD" ...
##  $ UNIT_SALES                       : num  1 14 18 13 19 4 29 35 75 25 ...
##  $ DOLLAR_SALES                     : num  4.62 86.86 89.73 65.6 72.93 ...
##  $ ITEM                             : chr  "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" ...
##  $ POP_SQMI                         : num  1.2 1.2 1.2 1.2 1.2 ...
##  $ MONTH                            : num  6 4 12 7 11 6 1 11 10 12 ...
##  $ PACKAGE2                         : chr  "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
##  $ ENERGY_DRINK                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CALORIC_SEGMENT_TEXT             : chr  NA NA NA NA ...
##  $ min_launch_date                  : Date, format: "2021-09-04" "2021-09-04" ...
##  $ WEEKS_SINCE_LAUNCH               : num  41 34 14 47 12 40 20 9 6 13 ...
##  $ MANUFACTURERJOLLYS               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BRANDHILL MOISTURE THRASHED APPLE: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BRANDSINGLE GROUP                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PACKAGE12SMALL 18ONE CUP         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PACKAGE12SMALL 20ONE CUP         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PACKAGE12SMALL 24ONE CUP         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PACKAGE12SMALL 6ONE CUP          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REGIONCALI_NEVADA                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REGIONCOLORADO                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REGIONDESERT_SW                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REGIONKANSAS                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REGIONMOUNTAIN                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REGIONNEWMEXICO                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REGIONNOCAL                      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REGIONNORTHERN                   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ REGIONPRAIRIE                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REGIONSOCAL                      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SEASONSPRING                     : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ SEASONSUMMER                     : num  1 0 0 1 0 1 0 0 0 0 ...
##  $ SEASONWINTER                     : num  0 0 1 0 0 0 1 0 0 1 ...
# List to store unique values for each variable
unique_values_list <- list()

# Columns to get unique values for
columns_to_get_unique_values <- c("BRAND", "PACKAGE", "ITEM", "REGION", "SEASON")

# Get unique values for each variable and store in the list
for (col in columns_to_get_unique_values) {
  unique_values_list[[col]] <- unique(plum[[col]])
}

# Loop over unique regions and create new columns
for (region in unique_values_list$REGION) {
  plum[[region]] <- as.integer(grepl(region, plum$REGION))
}

# Loop over unique brands and create new columns
for (brand in unique_values_list$BRAND) {
  plum[[brand]] <- as.integer(grepl(brand, plum$BRAND))
}

# Loop over unique brands and create new columns
for (item in unique_values_list$ITEM) {
  plum[[item]] <- as.integer(grepl(item, plum$ITEM))
}

# Loop over unique regions and create new columns
for (package in unique_values_list$PACKAGE) {
  plum[[package]] <- as.integer(grepl(package, plum$PACKAGE))
}

# Loop over unique regions and create new columns
for (season in unique_values_list$SEASON) {
  plum[[season]] <- as.integer(grepl(season, plum$SEASON))
}

plum <- plum[, !names(plum) == ""]



# Remove unnecessary columns
one_hot_plum <- plum %>%
  select(-CALORIC_SEGMENT, -CATEGORY, -MANUFACTURER, -BRAND, -REGION, -PACKAGE, -ITEM)

head(one_hot_plum)
## # A tibble: 6 × 37
##   DATE       UNIT_SALES DOLLAR_SALES POP_SQMI MONTH SEASON PACKAGE2 ENERGY_DRINK
##   <date>          <dbl>        <dbl>    <dbl> <dbl> <chr>  <chr>           <dbl>
## 1 2022-06-18          1         4.62     1.20     6 SUMMER CUP 12 …            0
## 2 2022-04-30         14        86.9      1.20     4 SPRING CUP 12 …            0
## 3 2021-12-11         18        89.7      1.20    12 WINTER CUP 12 …            0
## 4 2022-07-30         13        65.6      1.20     7 SUMMER CUP 12 …            0
## 5 2021-11-27         19        72.9      1.20    11 FALL   CUP 12 …            0
## 6 2022-06-11          4        25.6      1.20     6 SUMMER CUP 12 …            0
## # ℹ 29 more variables: CALORIC_SEGMENT_TEXT <chr>, min_launch_date <date>,
## #   WEEKS_SINCE_LAUNCH <dbl>, NORTHERN <int>, CALI_NEVADA <int>,
## #   DESERT_SW <int>, MOUNTAIN <int>, SOCAL <int>, PRAIRIE <int>, ARIZONA <int>,
## #   NEWMEXICO <int>, NOCAL <int>, COLORADO <int>, KANSAS <int>,
## #   `HILL MOISTURE THRASHED APPLE` <int>, `BEAUTIFUL GREENER` <int>,
## #   `SINGLE GROUP` <int>, `RAINING  THRASHED PLUM` <int>,
## #   `BEAUTIFUL GREENER  PLUM` <int>, `ZIZZLES  PLUM` <int>, …
write.csv(one_hot_plum, "one_hot_plum.csv", row.names = FALSE)

#cleanup all objects except one_hot_plum
#rm(list = setdiff(ls(), "one_hot_plum"))
# Load and prepare dataset
df1 <- read.csv("one_hot_plum.csv") 
df1 <- df1 %>% 
  select(-DATE, -MONTH, -WINTER, -SPRING, -FALL, -DOLLAR_SALES, -SUMMER, -SEASON)
# Summarize the dataset
skimr::skim(df1)
Data summary
Name df1
Number of rows 43366
Number of columns 29
_______________________
Column type frequency:
character 2
logical 1
numeric 26
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
PACKAGE2 0 1 22 26 0 5 0
min_launch_date 0 1 10 10 0 2 0

Variable type: logical

skim_variable n_missing complete_rate mean count
CALORIC_SEGMENT_TEXT 43366 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
UNIT_SALES 0 1 55.88 90.12 1.00 9.00 26.00 63.00 2716.00 ▇▁▁▁▁
POP_SQMI 0 1 1713.02 1965.76 0.18 52.08 836.76 3191.96 6769.35 ▇▂▂▁▁
ENERGY_DRINK 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
WEEKS_SINCE_LAUNCH 0 1 59.74 35.11 0.00 30.00 58.00 86.00 128.00 ▇▇▇▆▅
NORTHERN 0 1 0.15 0.36 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
CALI_NEVADA 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
DESERT_SW 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
MOUNTAIN 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
SOCAL 0 1 0.16 0.36 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
PRAIRIE 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ARIZONA 0 1 0.24 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
NEWMEXICO 0 1 0.04 0.19 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
NOCAL 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
COLORADO 0 1 0.12 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
KANSAS 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
HILL.MOISTURE.THRASHED.APPLE 0 1 0.34 0.47 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
BEAUTIFUL.GREENER 0 1 0.58 0.49 0.00 0.00 1.00 1.00 1.00 ▆▁▁▁▇
SINGLE.GROUP 0 1 0.08 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
RAINING..THRASHED.PLUM 0 1 0.34 0.47 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
BEAUTIFUL.GREENER..PLUM 0 1 0.58 0.49 0.00 0.00 1.00 1.00 1.00 ▆▁▁▁▇
ZIZZLES..PLUM 0 1 0.08 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
X12SMALL.12ONE.CUP 0 1 0.88 0.32 0.00 1.00 1.00 1.00 1.00 ▁▁▁▁▇
X12SMALL.6ONE.CUP 0 1 0.01 0.08 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
X12SMALL.24ONE.CUP 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
X12SMALL.20ONE.CUP 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
X12SMALL.18ONE.CUP 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
#Remove outliers in top 1% of Unit Sales. 
df1 <- df1 %>% filter(UNIT_SALES < quantile(UNIT_SALES, 0.99))
# Split the data
set.seed(123)
df_testtrn <- initial_split(df1, prop = 0.8, strata = UNIT_SALES)
Train <- training(df_testtrn)
Test <- testing(df_testtrn)

# Prepare features and labels for XGBoost
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES

train_features <- lapply(train_features, as.numeric)
## Warning in lapply(train_features, as.numeric): NAs introduced by coercion

## Warning in lapply(train_features, as.numeric): NAs introduced by coercion
train_labels <- as.numeric(train_labels)



train_features <- lapply(train_features, as.numeric)

# Convert the list to a matrix
train_matrix <- do.call(cbind, train_features)

# Create DMatrix
train_dmatrix <- xgb.DMatrix(data = train_matrix, label = train_labels)



test_features <- lapply(test_features, as.numeric)
## Warning in lapply(test_features, as.numeric): NAs introduced by coercion
## Warning in lapply(test_features, as.numeric): NAs introduced by coercion
# Convert the list to a matrix
test_matrix <- do.call(cbind, test_features)

# Create DMatrix
test_dmatrix <- xgb.DMatrix(data = test_matrix, label = test_labels)
# Define XGBoost parameters
set.seed(123)



params <- list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eval_metric = "rmse",
  eta = 0.05,
  max_depth = 3,
  min_child_weight = 1,
  subsample = 0.7,
  colsample_bytree = 0.6,
  reg_lambda = 1,
  reg_alpha = 0
)
# #use all cores except for one
# doParallel::registerDoParallel(cores = parallel::detectCores() - 1)
# Perform cross-validation to find the optimal number of boosting rounds
cv_results <- xgb.cv(
  params = params,
  data = train_dmatrix,  
  nfold = 5,
  nrounds = 500,  # Changed from 'num_boost_round' to 'nrounds'
  early_stopping_rounds = 10,
  metrics = "rmse",
  seed = 123
)
## [1]  train-rmse:81.630267+0.229463   test-rmse:81.626172+0.677657 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
## 
## [2]  train-rmse:79.881361+0.214846   test-rmse:79.871352+0.708257 
## [3]  train-rmse:78.272570+0.223207   test-rmse:78.259611+0.724948 
## [4]  train-rmse:76.823268+0.220764   test-rmse:76.809766+0.713121 
## [5]  train-rmse:75.485726+0.263909   test-rmse:75.475546+0.668567 
## [6]  train-rmse:74.292984+0.212330   test-rmse:74.288742+0.693536 
## [7]  train-rmse:73.127865+0.209287   test-rmse:73.129229+0.710464 
## [8]  train-rmse:72.043028+0.174393   test-rmse:72.043241+0.769906 
## [9]  train-rmse:71.080869+0.169583   test-rmse:71.082009+0.790404 
## [10] train-rmse:70.164555+0.190074   test-rmse:70.167679+0.758776 
## [11] train-rmse:69.370928+0.240068   test-rmse:69.375989+0.722733 
## [12] train-rmse:68.589532+0.273396   test-rmse:68.594246+0.662321 
## [13] train-rmse:67.829737+0.313070   test-rmse:67.837536+0.657090 
## [14] train-rmse:67.158882+0.261371   test-rmse:67.165648+0.670325 
## [15] train-rmse:66.592793+0.226190   test-rmse:66.601380+0.677492 
## [16] train-rmse:66.031029+0.253788   test-rmse:66.049014+0.669672 
## [17] train-rmse:65.488243+0.220998   test-rmse:65.505498+0.741396 
## [18] train-rmse:65.014802+0.198313   test-rmse:65.035193+0.735735 
## [19] train-rmse:64.557788+0.201890   test-rmse:64.580867+0.718667 
## [20] train-rmse:64.170577+0.200613   test-rmse:64.196890+0.731885 
## [21] train-rmse:63.783276+0.214663   test-rmse:63.802089+0.659697 
## [22] train-rmse:63.441278+0.258247   test-rmse:63.456047+0.643660 
## [23] train-rmse:63.093087+0.289292   test-rmse:63.109051+0.621254 
## [24] train-rmse:62.756751+0.298107   test-rmse:62.780768+0.590581 
## [25] train-rmse:62.478861+0.286614   test-rmse:62.505645+0.566679 
## [26] train-rmse:62.198562+0.255273   test-rmse:62.228255+0.579993 
## [27] train-rmse:61.948781+0.264637   test-rmse:61.986840+0.583754 
## [28] train-rmse:61.690860+0.260911   test-rmse:61.727381+0.586151 
## [29] train-rmse:61.479941+0.271026   test-rmse:61.528549+0.589573 
## [30] train-rmse:61.268712+0.293149   test-rmse:61.316958+0.585824 
## [31] train-rmse:61.107641+0.293732   test-rmse:61.153426+0.561179 
## [32] train-rmse:60.932865+0.309358   test-rmse:60.980867+0.536239 
## [33] train-rmse:60.752950+0.285379   test-rmse:60.798071+0.539515 
## [34] train-rmse:60.588747+0.289656   test-rmse:60.624855+0.555171 
## [35] train-rmse:60.460900+0.293315   test-rmse:60.500084+0.543081 
## [36] train-rmse:60.291709+0.315158   test-rmse:60.339228+0.531428 
## [37] train-rmse:60.147875+0.330357   test-rmse:60.193080+0.530971 
## [38] train-rmse:60.006961+0.342518   test-rmse:60.056798+0.560064 
## [39] train-rmse:59.882822+0.318971   test-rmse:59.935740+0.542633 
## [40] train-rmse:59.741222+0.346142   test-rmse:59.795272+0.517427 
## [41] train-rmse:59.657343+0.351221   test-rmse:59.711949+0.494237 
## [42] train-rmse:59.566380+0.366305   test-rmse:59.621232+0.518428 
## [43] train-rmse:59.457046+0.317887   test-rmse:59.514116+0.538085 
## [44] train-rmse:59.356065+0.304824   test-rmse:59.411539+0.549573 
## [45] train-rmse:59.244664+0.326972   test-rmse:59.303286+0.539908 
## [46] train-rmse:59.168666+0.353693   test-rmse:59.228497+0.558409 
## [47] train-rmse:59.019725+0.290506   test-rmse:59.084133+0.563688 
## [48] train-rmse:58.945742+0.286975   test-rmse:59.012749+0.556246 
## [49] train-rmse:58.851308+0.269773   test-rmse:58.917844+0.569225 
## [50] train-rmse:58.775141+0.268298   test-rmse:58.839237+0.547050 
## [51] train-rmse:58.700958+0.281126   test-rmse:58.767071+0.567100 
## [52] train-rmse:58.640054+0.278511   test-rmse:58.709655+0.567117 
## [53] train-rmse:58.550649+0.273418   test-rmse:58.617611+0.592622 
## [54] train-rmse:58.477617+0.280776   test-rmse:58.544208+0.574535 
## [55] train-rmse:58.407846+0.302689   test-rmse:58.471429+0.572017 
## [56] train-rmse:58.356404+0.321335   test-rmse:58.423785+0.587572 
## [57] train-rmse:58.290028+0.294739   test-rmse:58.360798+0.589571 
## [58] train-rmse:58.175656+0.295934   test-rmse:58.251539+0.552968 
## [59] train-rmse:58.067886+0.260151   test-rmse:58.147153+0.587540 
## [60] train-rmse:58.001803+0.256351   test-rmse:58.081751+0.584127 
## [61] train-rmse:57.932900+0.247721   test-rmse:58.010225+0.595180 
## [62] train-rmse:57.875039+0.251259   test-rmse:57.954666+0.611563 
## [63] train-rmse:57.827711+0.251043   test-rmse:57.910608+0.615903 
## [64] train-rmse:57.758052+0.279961   test-rmse:57.843716+0.598654 
## [65] train-rmse:57.681478+0.257849   test-rmse:57.767914+0.631803 
## [66] train-rmse:57.637336+0.255986   test-rmse:57.723125+0.643561 
## [67] train-rmse:57.533389+0.190269   test-rmse:57.614456+0.607521 
## [68] train-rmse:57.479300+0.191485   test-rmse:57.557656+0.620173 
## [69] train-rmse:57.410259+0.199307   test-rmse:57.490650+0.615157 
## [70] train-rmse:57.360313+0.202300   test-rmse:57.441984+0.625113 
## [71] train-rmse:57.271802+0.215720   test-rmse:57.361508+0.636964 
## [72] train-rmse:57.209691+0.213607   test-rmse:57.300893+0.666422 
## [73] train-rmse:57.174996+0.214229   test-rmse:57.267039+0.662422 
## [74] train-rmse:57.153483+0.213064   test-rmse:57.245148+0.663866 
## [75] train-rmse:57.096608+0.229833   test-rmse:57.191958+0.647759 
## [76] train-rmse:57.033672+0.215826   test-rmse:57.126512+0.638247 
## [77] train-rmse:56.970746+0.175763   test-rmse:57.061061+0.649533 
## [78] train-rmse:56.901679+0.207057   test-rmse:56.994932+0.657498 
## [79] train-rmse:56.833526+0.177909   test-rmse:56.920276+0.647096 
## [80] train-rmse:56.762319+0.202860   test-rmse:56.849101+0.686820 
## [81] train-rmse:56.739632+0.208370   test-rmse:56.827357+0.693597 
## [82] train-rmse:56.648016+0.234231   test-rmse:56.740654+0.691097 
## [83] train-rmse:56.600933+0.249074   test-rmse:56.698080+0.685679 
## [84] train-rmse:56.571267+0.243906   test-rmse:56.666838+0.700942 
## [85] train-rmse:56.507834+0.246624   test-rmse:56.609482+0.693862 
## [86] train-rmse:56.452263+0.248738   test-rmse:56.551870+0.701024 
## [87] train-rmse:56.413453+0.263145   test-rmse:56.517679+0.702240 
## [88] train-rmse:56.359159+0.265078   test-rmse:56.466690+0.728637 
## [89] train-rmse:56.309135+0.250641   test-rmse:56.419189+0.728227 
## [90] train-rmse:56.226592+0.225650   test-rmse:56.338204+0.662713 
## [91] train-rmse:56.147147+0.182210   test-rmse:56.256643+0.670402 
## [92] train-rmse:56.076532+0.191742   test-rmse:56.186993+0.674354 
## [93] train-rmse:56.046896+0.186431   test-rmse:56.156393+0.672504 
## [94] train-rmse:56.007977+0.199387   test-rmse:56.117867+0.682580 
## [95] train-rmse:55.960778+0.210375   test-rmse:56.072375+0.702053 
## [96] train-rmse:55.917261+0.187397   test-rmse:56.024519+0.703274 
## [97] train-rmse:55.871236+0.198597   test-rmse:55.976201+0.718171 
## [98] train-rmse:55.839504+0.198780   test-rmse:55.946158+0.726819 
## [99] train-rmse:55.809102+0.194073   test-rmse:55.917547+0.736123 
## [100]    train-rmse:55.763872+0.192004   test-rmse:55.876098+0.730846 
## [101]    train-rmse:55.731571+0.191632   test-rmse:55.841217+0.740818 
## [102]    train-rmse:55.695517+0.217581   test-rmse:55.807055+0.736365 
## [103]    train-rmse:55.624197+0.253877   test-rmse:55.738564+0.772335 
## [104]    train-rmse:55.559861+0.268020   test-rmse:55.682832+0.786981 
## [105]    train-rmse:55.519325+0.247517   test-rmse:55.645350+0.781678 
## [106]    train-rmse:55.484492+0.252326   test-rmse:55.615998+0.787121 
## [107]    train-rmse:55.456025+0.224627   test-rmse:55.586589+0.761211 
## [108]    train-rmse:55.425135+0.239484   test-rmse:55.554160+0.767244 
## [109]    train-rmse:55.372286+0.228401   test-rmse:55.505225+0.767243 
## [110]    train-rmse:55.336798+0.224742   test-rmse:55.469656+0.755692 
## [111]    train-rmse:55.279331+0.192203   test-rmse:55.407375+0.734264 
## [112]    train-rmse:55.217986+0.204361   test-rmse:55.349583+0.746905 
## [113]    train-rmse:55.174985+0.230868   test-rmse:55.307083+0.774874 
## [114]    train-rmse:55.141846+0.244563   test-rmse:55.278286+0.788674 
## [115]    train-rmse:55.085143+0.216153   test-rmse:55.222362+0.764833 
## [116]    train-rmse:55.024239+0.233182   test-rmse:55.166613+0.775983 
## [117]    train-rmse:54.991152+0.239653   test-rmse:55.133260+0.781301 
## [118]    train-rmse:54.962796+0.244577   test-rmse:55.106155+0.782663 
## [119]    train-rmse:54.926258+0.246865   test-rmse:55.075595+0.781010 
## [120]    train-rmse:54.867396+0.250801   test-rmse:55.020095+0.781422 
## [121]    train-rmse:54.844965+0.259237   test-rmse:55.001040+0.790954 
## [122]    train-rmse:54.821032+0.264449   test-rmse:54.980359+0.797478 
## [123]    train-rmse:54.786947+0.255252   test-rmse:54.946311+0.790867 
## [124]    train-rmse:54.762009+0.252717   test-rmse:54.922246+0.785393 
## [125]    train-rmse:54.718246+0.225357   test-rmse:54.878878+0.766525 
## [126]    train-rmse:54.684889+0.231414   test-rmse:54.852156+0.754337 
## [127]    train-rmse:54.641425+0.249492   test-rmse:54.810879+0.768981 
## [128]    train-rmse:54.609445+0.238509   test-rmse:54.785242+0.775092 
## [129]    train-rmse:54.581857+0.243227   test-rmse:54.760809+0.768996 
## [130]    train-rmse:54.536256+0.257385   test-rmse:54.718157+0.745436 
## [131]    train-rmse:54.509029+0.261115   test-rmse:54.687466+0.735486 
## [132]    train-rmse:54.475014+0.234828   test-rmse:54.646354+0.692035 
## [133]    train-rmse:54.429854+0.208334   test-rmse:54.602112+0.694216 
## [134]    train-rmse:54.367643+0.193179   test-rmse:54.535427+0.690863 
## [135]    train-rmse:54.331437+0.182788   test-rmse:54.499966+0.676328 
## [136]    train-rmse:54.300474+0.175121   test-rmse:54.472845+0.674958 
## [137]    train-rmse:54.276207+0.188594   test-rmse:54.450935+0.685443 
## [138]    train-rmse:54.246681+0.192243   test-rmse:54.424247+0.683780 
## [139]    train-rmse:54.157299+0.184642   test-rmse:54.334394+0.677182 
## [140]    train-rmse:54.106344+0.198444   test-rmse:54.280891+0.649584 
## [141]    train-rmse:54.056257+0.183672   test-rmse:54.235390+0.655390 
## [142]    train-rmse:54.029176+0.177656   test-rmse:54.212888+0.660211 
## [143]    train-rmse:53.998538+0.164630   test-rmse:54.181777+0.637161 
## [144]    train-rmse:53.959065+0.166354   test-rmse:54.140648+0.639510 
## [145]    train-rmse:53.905649+0.173966   test-rmse:54.087415+0.630048 
## [146]    train-rmse:53.888002+0.171069   test-rmse:54.069365+0.635489 
## [147]    train-rmse:53.843474+0.154882   test-rmse:54.019988+0.597519 
## [148]    train-rmse:53.799445+0.151965   test-rmse:53.981716+0.615149 
## [149]    train-rmse:53.771360+0.139799   test-rmse:53.954076+0.625219 
## [150]    train-rmse:53.732207+0.128550   test-rmse:53.910070+0.604666 
## [151]    train-rmse:53.699590+0.123948   test-rmse:53.881789+0.601795 
## [152]    train-rmse:53.667627+0.130865   test-rmse:53.852119+0.616736 
## [153]    train-rmse:53.634653+0.119205   test-rmse:53.820714+0.625430 
## [154]    train-rmse:53.587645+0.142444   test-rmse:53.769894+0.620149 
## [155]    train-rmse:53.540948+0.143567   test-rmse:53.722554+0.629343 
## [156]    train-rmse:53.487746+0.137733   test-rmse:53.669454+0.644294 
## [157]    train-rmse:53.465815+0.131668   test-rmse:53.648748+0.639706 
## [158]    train-rmse:53.419272+0.137845   test-rmse:53.605304+0.631079 
## [159]    train-rmse:53.368737+0.151506   test-rmse:53.553924+0.631673 
## [160]    train-rmse:53.308563+0.138651   test-rmse:53.493494+0.582431 
## [161]    train-rmse:53.259921+0.139940   test-rmse:53.450387+0.592862 
## [162]    train-rmse:53.223558+0.126198   test-rmse:53.414197+0.587881 
## [163]    train-rmse:53.198955+0.133516   test-rmse:53.389879+0.581505 
## [164]    train-rmse:53.162419+0.124220   test-rmse:53.355610+0.571083 
## [165]    train-rmse:53.131560+0.129566   test-rmse:53.327911+0.579513 
## [166]    train-rmse:53.106261+0.118362   test-rmse:53.298729+0.545948 
## [167]    train-rmse:53.068000+0.153167   test-rmse:53.264944+0.530046 
## [168]    train-rmse:53.015458+0.159450   test-rmse:53.209961+0.538813 
## [169]    train-rmse:52.983381+0.158104   test-rmse:53.180160+0.545268 
## [170]    train-rmse:52.959562+0.150463   test-rmse:53.158061+0.527766 
## [171]    train-rmse:52.935396+0.142959   test-rmse:53.131941+0.545097 
## [172]    train-rmse:52.886603+0.166744   test-rmse:53.084112+0.528699 
## [173]    train-rmse:52.866571+0.165255   test-rmse:53.064859+0.512178 
## [174]    train-rmse:52.804424+0.155451   test-rmse:52.998700+0.509459 
## [175]    train-rmse:52.770706+0.175907   test-rmse:52.965344+0.512573 
## [176]    train-rmse:52.757397+0.170585   test-rmse:52.953940+0.508158 
## [177]    train-rmse:52.730016+0.157395   test-rmse:52.927338+0.511229 
## [178]    train-rmse:52.682640+0.157329   test-rmse:52.877060+0.522447 
## [179]    train-rmse:52.648153+0.152243   test-rmse:52.845066+0.498917 
## [180]    train-rmse:52.623483+0.143623   test-rmse:52.820560+0.499567 
## [181]    train-rmse:52.586091+0.170753   test-rmse:52.784392+0.487405 
## [182]    train-rmse:52.552466+0.150088   test-rmse:52.754478+0.500995 
## [183]    train-rmse:52.529281+0.151226   test-rmse:52.731807+0.506121 
## [184]    train-rmse:52.491531+0.153825   test-rmse:52.695661+0.470661 
## [185]    train-rmse:52.435932+0.184135   test-rmse:52.640909+0.452383 
## [186]    train-rmse:52.413296+0.199402   test-rmse:52.619422+0.446946 
## [187]    train-rmse:52.346619+0.208648   test-rmse:52.559863+0.447871 
## [188]    train-rmse:52.319457+0.223336   test-rmse:52.529680+0.445157 
## [189]    train-rmse:52.284153+0.217766   test-rmse:52.496169+0.451123 
## [190]    train-rmse:52.259389+0.205419   test-rmse:52.471109+0.463142 
## [191]    train-rmse:52.209733+0.217047   test-rmse:52.421514+0.461682 
## [192]    train-rmse:52.192552+0.216426   test-rmse:52.404620+0.450037 
## [193]    train-rmse:52.155634+0.209588   test-rmse:52.370776+0.470523 
## [194]    train-rmse:52.115550+0.202560   test-rmse:52.334571+0.499604 
## [195]    train-rmse:52.092087+0.205112   test-rmse:52.312068+0.487624 
## [196]    train-rmse:52.059447+0.207383   test-rmse:52.284108+0.513159 
## [197]    train-rmse:52.020032+0.204224   test-rmse:52.252744+0.514555 
## [198]    train-rmse:52.009217+0.205728   test-rmse:52.241914+0.521988 
## [199]    train-rmse:52.003194+0.206391   test-rmse:52.236938+0.521398 
## [200]    train-rmse:51.962240+0.237688   test-rmse:52.192219+0.529822 
## [201]    train-rmse:51.946828+0.249863   test-rmse:52.175337+0.534000 
## [202]    train-rmse:51.914253+0.264252   test-rmse:52.148062+0.523497 
## [203]    train-rmse:51.896456+0.249079   test-rmse:52.130681+0.519582 
## [204]    train-rmse:51.866326+0.260130   test-rmse:52.102796+0.503687 
## [205]    train-rmse:51.841111+0.255080   test-rmse:52.076538+0.510993 
## [206]    train-rmse:51.814327+0.246788   test-rmse:52.049333+0.515875 
## [207]    train-rmse:51.799398+0.244131   test-rmse:52.035073+0.523803 
## [208]    train-rmse:51.771106+0.231015   test-rmse:52.012317+0.528578 
## [209]    train-rmse:51.749950+0.232870   test-rmse:51.992807+0.540014 
## [210]    train-rmse:51.726653+0.251009   test-rmse:51.971759+0.541791 
## [211]    train-rmse:51.682140+0.262865   test-rmse:51.933076+0.569675 
## [212]    train-rmse:51.671846+0.261094   test-rmse:51.923722+0.570758 
## [213]    train-rmse:51.617854+0.255221   test-rmse:51.867466+0.545813 
## [214]    train-rmse:51.586885+0.279584   test-rmse:51.835227+0.561485 
## [215]    train-rmse:51.550567+0.258969   test-rmse:51.798920+0.548652 
## [216]    train-rmse:51.536433+0.260029   test-rmse:51.788176+0.549711 
## [217]    train-rmse:51.512754+0.253103   test-rmse:51.769537+0.560652 
## [218]    train-rmse:51.479070+0.247629   test-rmse:51.735128+0.578009 
## [219]    train-rmse:51.392100+0.265619   test-rmse:51.649909+0.578546 
## [220]    train-rmse:51.365021+0.278659   test-rmse:51.621507+0.595252 
## [221]    train-rmse:51.352615+0.283549   test-rmse:51.610475+0.593393 
## [222]    train-rmse:51.304614+0.291410   test-rmse:51.557559+0.601681 
## [223]    train-rmse:51.294926+0.289357   test-rmse:51.548150+0.604253 
## [224]    train-rmse:51.282344+0.293292   test-rmse:51.536923+0.610662 
## [225]    train-rmse:51.265122+0.286985   test-rmse:51.521889+0.608402 
## [226]    train-rmse:51.207154+0.283790   test-rmse:51.467357+0.633032 
## [227]    train-rmse:51.176420+0.270235   test-rmse:51.440416+0.627965 
## [228]    train-rmse:51.152071+0.272282   test-rmse:51.413688+0.631166 
## [229]    train-rmse:51.093520+0.292179   test-rmse:51.360106+0.630020 
## [230]    train-rmse:51.060541+0.305285   test-rmse:51.326923+0.652909 
## [231]    train-rmse:51.011456+0.321436   test-rmse:51.284448+0.653974 
## [232]    train-rmse:50.988841+0.309681   test-rmse:51.261488+0.632388 
## [233]    train-rmse:50.969569+0.303568   test-rmse:51.243048+0.614300 
## [234]    train-rmse:50.943983+0.301473   test-rmse:51.215808+0.608908 
## [235]    train-rmse:50.922203+0.307744   test-rmse:51.193374+0.620012 
## [236]    train-rmse:50.860435+0.285054   test-rmse:51.130823+0.610779 
## [237]    train-rmse:50.834252+0.304128   test-rmse:51.103216+0.632782 
## [238]    train-rmse:50.799644+0.318413   test-rmse:51.072520+0.623841 
## [239]    train-rmse:50.778742+0.308479   test-rmse:51.052386+0.624047 
## [240]    train-rmse:50.750556+0.319797   test-rmse:51.027065+0.642584 
## [241]    train-rmse:50.722183+0.335080   test-rmse:50.999459+0.654873 
## [242]    train-rmse:50.688058+0.333081   test-rmse:50.971423+0.661332 
## [243]    train-rmse:50.655667+0.332073   test-rmse:50.934046+0.655906 
## [244]    train-rmse:50.633679+0.335648   test-rmse:50.913231+0.661264 
## [245]    train-rmse:50.597141+0.330307   test-rmse:50.878455+0.671071 
## [246]    train-rmse:50.577970+0.337149   test-rmse:50.862143+0.678568 
## [247]    train-rmse:50.542562+0.355844   test-rmse:50.829991+0.698754 
## [248]    train-rmse:50.517680+0.347116   test-rmse:50.805375+0.683405 
## [249]    train-rmse:50.472970+0.337021   test-rmse:50.767970+0.676634 
## [250]    train-rmse:50.451522+0.339911   test-rmse:50.745561+0.674978 
## [251]    train-rmse:50.432129+0.340671   test-rmse:50.727929+0.674892 
## [252]    train-rmse:50.417412+0.337981   test-rmse:50.713877+0.673192 
## [253]    train-rmse:50.394618+0.349142   test-rmse:50.691315+0.664828 
## [254]    train-rmse:50.343162+0.369974   test-rmse:50.645195+0.688738 
## [255]    train-rmse:50.330032+0.368420   test-rmse:50.630270+0.687412 
## [256]    train-rmse:50.295396+0.366108   test-rmse:50.599215+0.685545 
## [257]    train-rmse:50.267211+0.383087   test-rmse:50.572509+0.697634 
## [258]    train-rmse:50.244212+0.399685   test-rmse:50.551951+0.699466 
## [259]    train-rmse:50.228644+0.385239   test-rmse:50.536837+0.693481 
## [260]    train-rmse:50.208809+0.377760   test-rmse:50.517713+0.686768 
## [261]    train-rmse:50.158949+0.384672   test-rmse:50.475248+0.682131 
## [262]    train-rmse:50.133027+0.378739   test-rmse:50.451532+0.685988 
## [263]    train-rmse:50.101055+0.380445   test-rmse:50.420908+0.697885 
## [264]    train-rmse:50.070740+0.400868   test-rmse:50.389424+0.706917 
## [265]    train-rmse:50.049657+0.412831   test-rmse:50.369297+0.709664 
## [266]    train-rmse:50.016947+0.410031   test-rmse:50.337030+0.696831 
## [267]    train-rmse:49.983327+0.411823   test-rmse:50.307918+0.686903 
## [268]    train-rmse:49.960177+0.414932   test-rmse:50.287154+0.686481 
## [269]    train-rmse:49.927445+0.395192   test-rmse:50.254636+0.671212 
## [270]    train-rmse:49.900721+0.412398   test-rmse:50.227456+0.674961 
## [271]    train-rmse:49.870063+0.400849   test-rmse:50.204033+0.678661 
## [272]    train-rmse:49.825654+0.382584   test-rmse:50.157693+0.687798 
## [273]    train-rmse:49.811453+0.390097   test-rmse:50.144145+0.687643 
## [274]    train-rmse:49.785182+0.393194   test-rmse:50.119111+0.674186 
## [275]    train-rmse:49.762545+0.402275   test-rmse:50.097474+0.690686 
## [276]    train-rmse:49.740347+0.399995   test-rmse:50.077029+0.694611 
## [277]    train-rmse:49.719806+0.398144   test-rmse:50.060300+0.696424 
## [278]    train-rmse:49.687613+0.393686   test-rmse:50.031561+0.690431 
## [279]    train-rmse:49.662784+0.362996   test-rmse:50.003932+0.674240 
## [280]    train-rmse:49.635479+0.346225   test-rmse:49.975956+0.675142 
## [281]    train-rmse:49.584697+0.336221   test-rmse:49.920168+0.637033 
## [282]    train-rmse:49.562219+0.318312   test-rmse:49.901838+0.633121 
## [283]    train-rmse:49.527981+0.306058   test-rmse:49.866872+0.623275 
## [284]    train-rmse:49.508551+0.312345   test-rmse:49.846062+0.618571 
## [285]    train-rmse:49.492505+0.322860   test-rmse:49.830495+0.616734 
## [286]    train-rmse:49.474078+0.314812   test-rmse:49.812887+0.603190 
## [287]    train-rmse:49.443111+0.298315   test-rmse:49.783119+0.596401 
## [288]    train-rmse:49.429506+0.289426   test-rmse:49.769931+0.591344 
## [289]    train-rmse:49.411895+0.292062   test-rmse:49.757387+0.597523 
## [290]    train-rmse:49.389354+0.294248   test-rmse:49.738389+0.611756 
## [291]    train-rmse:49.368772+0.297795   test-rmse:49.719640+0.622860 
## [292]    train-rmse:49.340714+0.283909   test-rmse:49.692721+0.618444 
## [293]    train-rmse:49.320961+0.278823   test-rmse:49.674466+0.607283 
## [294]    train-rmse:49.314610+0.282697   test-rmse:49.668728+0.612377 
## [295]    train-rmse:49.300537+0.277849   test-rmse:49.656495+0.608479 
## [296]    train-rmse:49.286605+0.283280   test-rmse:49.643806+0.605509 
## [297]    train-rmse:49.275747+0.277101   test-rmse:49.634159+0.596675 
## [298]    train-rmse:49.229699+0.281668   test-rmse:49.596139+0.574301 
## [299]    train-rmse:49.209023+0.291725   test-rmse:49.573944+0.571933 
## [300]    train-rmse:49.164716+0.293815   test-rmse:49.531079+0.578749 
## [301]    train-rmse:49.146106+0.285626   test-rmse:49.513206+0.558163 
## [302]    train-rmse:49.136925+0.283836   test-rmse:49.503960+0.561784 
## [303]    train-rmse:49.109552+0.283844   test-rmse:49.476731+0.564421 
## [304]    train-rmse:49.073856+0.257565   test-rmse:49.437303+0.537131 
## [305]    train-rmse:49.049789+0.265613   test-rmse:49.408540+0.542514 
## [306]    train-rmse:49.029791+0.255780   test-rmse:49.387674+0.533429 
## [307]    train-rmse:49.021435+0.251833   test-rmse:49.379007+0.527744 
## [308]    train-rmse:49.007050+0.248746   test-rmse:49.365310+0.525535 
## [309]    train-rmse:48.997660+0.246208   test-rmse:49.358204+0.524078 
## [310]    train-rmse:48.986972+0.241315   test-rmse:49.346693+0.529508 
## [311]    train-rmse:48.976488+0.238269   test-rmse:49.338820+0.533055 
## [312]    train-rmse:48.957183+0.230884   test-rmse:49.321145+0.531541 
## [313]    train-rmse:48.917265+0.224773   test-rmse:49.284210+0.527102 
## [314]    train-rmse:48.890188+0.245408   test-rmse:49.253327+0.540031 
## [315]    train-rmse:48.863295+0.225973   test-rmse:49.231574+0.521921 
## [316]    train-rmse:48.821355+0.207980   test-rmse:49.191054+0.521142 
## [317]    train-rmse:48.808292+0.196583   test-rmse:49.176090+0.517757 
## [318]    train-rmse:48.772804+0.197224   test-rmse:49.140060+0.525197 
## [319]    train-rmse:48.746310+0.219849   test-rmse:49.116804+0.530430 
## [320]    train-rmse:48.723188+0.223478   test-rmse:49.095551+0.549647 
## [321]    train-rmse:48.693346+0.212870   test-rmse:49.069517+0.545191 
## [322]    train-rmse:48.675566+0.222117   test-rmse:49.052229+0.546813 
## [323]    train-rmse:48.632635+0.215210   test-rmse:49.005653+0.545449 
## [324]    train-rmse:48.614469+0.212986   test-rmse:48.987033+0.557075 
## [325]    train-rmse:48.585699+0.221631   test-rmse:48.958753+0.543703 
## [326]    train-rmse:48.572150+0.221539   test-rmse:48.946046+0.551685 
## [327]    train-rmse:48.533051+0.244693   test-rmse:48.912639+0.551135 
## [328]    train-rmse:48.508351+0.247601   test-rmse:48.888423+0.542116 
## [329]    train-rmse:48.496827+0.249885   test-rmse:48.877426+0.543831 
## [330]    train-rmse:48.473542+0.256217   test-rmse:48.854418+0.534374 
## [331]    train-rmse:48.435336+0.269650   test-rmse:48.815055+0.546765 
## [332]    train-rmse:48.398496+0.270693   test-rmse:48.783446+0.568091 
## [333]    train-rmse:48.384039+0.270039   test-rmse:48.768703+0.559819 
## [334]    train-rmse:48.352772+0.267414   test-rmse:48.740104+0.582179 
## [335]    train-rmse:48.317041+0.278392   test-rmse:48.700070+0.582745 
## [336]    train-rmse:48.271014+0.263634   test-rmse:48.653796+0.581571 
## [337]    train-rmse:48.243069+0.262032   test-rmse:48.624698+0.574850 
## [338]    train-rmse:48.206546+0.256782   test-rmse:48.582775+0.593158 
## [339]    train-rmse:48.173983+0.235422   test-rmse:48.549086+0.570323 
## [340]    train-rmse:48.126806+0.243896   test-rmse:48.504181+0.585975 
## [341]    train-rmse:48.107737+0.232417   test-rmse:48.487309+0.580565 
## [342]    train-rmse:48.101207+0.236045   test-rmse:48.480873+0.582545 
## [343]    train-rmse:48.085998+0.229913   test-rmse:48.466236+0.581520 
## [344]    train-rmse:48.049837+0.227344   test-rmse:48.433509+0.587152 
## [345]    train-rmse:48.043540+0.228144   test-rmse:48.426248+0.591686 
## [346]    train-rmse:48.026740+0.233056   test-rmse:48.409055+0.600257 
## [347]    train-rmse:48.004036+0.240722   test-rmse:48.390861+0.606977 
## [348]    train-rmse:47.967174+0.235595   test-rmse:48.350300+0.595788 
## [349]    train-rmse:47.932734+0.235434   test-rmse:48.314904+0.571911 
## [350]    train-rmse:47.923112+0.237971   test-rmse:48.305856+0.579309 
## [351]    train-rmse:47.902938+0.242207   test-rmse:48.286165+0.589381 
## [352]    train-rmse:47.893122+0.239086   test-rmse:48.276899+0.587128 
## [353]    train-rmse:47.869678+0.240972   test-rmse:48.251558+0.573653 
## [354]    train-rmse:47.852990+0.248541   test-rmse:48.233590+0.573416 
## [355]    train-rmse:47.835669+0.250742   test-rmse:48.216615+0.568587 
## [356]    train-rmse:47.820683+0.257695   test-rmse:48.202918+0.580424 
## [357]    train-rmse:47.793758+0.243801   test-rmse:48.176515+0.569697 
## [358]    train-rmse:47.778933+0.249065   test-rmse:48.163044+0.579987 
## [359]    train-rmse:47.756398+0.251679   test-rmse:48.141452+0.568959 
## [360]    train-rmse:47.739446+0.249716   test-rmse:48.126617+0.562338 
## [361]    train-rmse:47.699373+0.231294   test-rmse:48.082723+0.545079 
## [362]    train-rmse:47.692692+0.229454   test-rmse:48.078347+0.540821 
## [363]    train-rmse:47.659250+0.234960   test-rmse:48.048442+0.549846 
## [364]    train-rmse:47.646098+0.227668   test-rmse:48.037004+0.554563 
## [365]    train-rmse:47.632850+0.233728   test-rmse:48.026209+0.558965 
## [366]    train-rmse:47.611175+0.227877   test-rmse:48.005803+0.561262 
## [367]    train-rmse:47.586992+0.218765   test-rmse:47.979775+0.555790 
## [368]    train-rmse:47.581754+0.219867   test-rmse:47.976356+0.556308 
## [369]    train-rmse:47.553657+0.227884   test-rmse:47.952426+0.566285 
## [370]    train-rmse:47.521702+0.216086   test-rmse:47.918669+0.550609 
## [371]    train-rmse:47.518039+0.215830   test-rmse:47.915419+0.548730 
## [372]    train-rmse:47.496125+0.225382   test-rmse:47.892706+0.563828 
## [373]    train-rmse:47.468260+0.212337   test-rmse:47.868168+0.564246 
## [374]    train-rmse:47.448251+0.206885   test-rmse:47.849864+0.569064 
## [375]    train-rmse:47.436414+0.203197   test-rmse:47.838811+0.575176 
## [376]    train-rmse:47.424658+0.201318   test-rmse:47.829083+0.568163 
## [377]    train-rmse:47.405887+0.192650   test-rmse:47.812922+0.572107 
## [378]    train-rmse:47.386599+0.180465   test-rmse:47.793985+0.558789 
## [379]    train-rmse:47.349698+0.186669   test-rmse:47.756111+0.553993 
## [380]    train-rmse:47.334275+0.187652   test-rmse:47.740899+0.560318 
## [381]    train-rmse:47.323605+0.187102   test-rmse:47.732846+0.552433 
## [382]    train-rmse:47.304820+0.195710   test-rmse:47.716014+0.565000 
## [383]    train-rmse:47.277525+0.198630   test-rmse:47.691018+0.560168 
## [384]    train-rmse:47.251091+0.212795   test-rmse:47.665132+0.562658 
## [385]    train-rmse:47.217116+0.202832   test-rmse:47.628777+0.547529 
## [386]    train-rmse:47.189932+0.202650   test-rmse:47.599563+0.548529 
## [387]    train-rmse:47.173037+0.205530   test-rmse:47.586651+0.555150 
## [388]    train-rmse:47.157772+0.188195   test-rmse:47.572461+0.550594 
## [389]    train-rmse:47.131156+0.171552   test-rmse:47.547495+0.530978 
## [390]    train-rmse:47.117274+0.161894   test-rmse:47.531893+0.524464 
## [391]    train-rmse:47.099124+0.158325   test-rmse:47.518009+0.531957 
## [392]    train-rmse:47.074843+0.164808   test-rmse:47.495008+0.526256 
## [393]    train-rmse:47.041917+0.171408   test-rmse:47.460796+0.527938 
## [394]    train-rmse:47.015490+0.177358   test-rmse:47.435241+0.526948 
## [395]    train-rmse:46.978819+0.162740   test-rmse:47.398736+0.526761 
## [396]    train-rmse:46.964785+0.151914   test-rmse:47.388799+0.520883 
## [397]    train-rmse:46.950314+0.149281   test-rmse:47.373952+0.514508 
## [398]    train-rmse:46.938541+0.153519   test-rmse:47.362632+0.527526 
## [399]    train-rmse:46.923992+0.154339   test-rmse:47.349711+0.538247 
## [400]    train-rmse:46.905430+0.159599   test-rmse:47.333259+0.549625 
## [401]    train-rmse:46.872646+0.142053   test-rmse:47.300134+0.536052 
## [402]    train-rmse:46.839009+0.137347   test-rmse:47.269876+0.538512 
## [403]    train-rmse:46.825924+0.140722   test-rmse:47.257595+0.540718 
## [404]    train-rmse:46.817986+0.142703   test-rmse:47.250834+0.537692 
## [405]    train-rmse:46.804420+0.146190   test-rmse:47.239553+0.532900 
## [406]    train-rmse:46.787158+0.151135   test-rmse:47.220282+0.522133 
## [407]    train-rmse:46.780307+0.153566   test-rmse:47.213490+0.524579 
## [408]    train-rmse:46.752274+0.144714   test-rmse:47.183689+0.513214 
## [409]    train-rmse:46.727256+0.141199   test-rmse:47.162344+0.508690 
## [410]    train-rmse:46.685495+0.135166   test-rmse:47.123695+0.514060 
## [411]    train-rmse:46.664700+0.141149   test-rmse:47.101035+0.514550 
## [412]    train-rmse:46.630422+0.127325   test-rmse:47.070627+0.499000 
## [413]    train-rmse:46.610133+0.131111   test-rmse:47.050126+0.490990 
## [414]    train-rmse:46.602250+0.134647   test-rmse:47.043330+0.492376 
## [415]    train-rmse:46.590089+0.139870   test-rmse:47.032632+0.487568 
## [416]    train-rmse:46.575823+0.144982   test-rmse:47.020403+0.487019 
## [417]    train-rmse:46.559308+0.151891   test-rmse:47.001225+0.497517 
## [418]    train-rmse:46.527756+0.165036   test-rmse:46.971448+0.512566 
## [419]    train-rmse:46.520385+0.163534   test-rmse:46.963651+0.510733 
## [420]    train-rmse:46.494781+0.153782   test-rmse:46.939815+0.499866 
## [421]    train-rmse:46.473816+0.141132   test-rmse:46.919823+0.499814 
## [422]    train-rmse:46.449073+0.131635   test-rmse:46.896594+0.496520 
## [423]    train-rmse:46.443901+0.133464   test-rmse:46.893488+0.494243 
## [424]    train-rmse:46.434006+0.132869   test-rmse:46.882677+0.489194 
## [425]    train-rmse:46.402466+0.145608   test-rmse:46.848725+0.498665 
## [426]    train-rmse:46.382356+0.132569   test-rmse:46.828101+0.500320 
## [427]    train-rmse:46.369642+0.135656   test-rmse:46.816460+0.511416 
## [428]    train-rmse:46.347415+0.128682   test-rmse:46.795274+0.517803 
## [429]    train-rmse:46.328780+0.134665   test-rmse:46.775673+0.509930 
## [430]    train-rmse:46.302149+0.133437   test-rmse:46.748001+0.508469 
## [431]    train-rmse:46.274665+0.147843   test-rmse:46.722698+0.509686 
## [432]    train-rmse:46.261323+0.151808   test-rmse:46.709088+0.509848 
## [433]    train-rmse:46.249735+0.158662   test-rmse:46.698956+0.507027 
## [434]    train-rmse:46.246740+0.157501   test-rmse:46.695615+0.507617 
## [435]    train-rmse:46.233844+0.154388   test-rmse:46.683808+0.495762 
## [436]    train-rmse:46.228359+0.149636   test-rmse:46.679462+0.494084 
## [437]    train-rmse:46.214059+0.143026   test-rmse:46.664547+0.492178 
## [438]    train-rmse:46.206525+0.144458   test-rmse:46.658119+0.492648 
## [439]    train-rmse:46.187209+0.153612   test-rmse:46.639668+0.485821 
## [440]    train-rmse:46.161612+0.167319   test-rmse:46.616022+0.495188 
## [441]    train-rmse:46.134639+0.167320   test-rmse:46.590932+0.511245 
## [442]    train-rmse:46.109484+0.177292   test-rmse:46.567701+0.512079 
## [443]    train-rmse:46.085227+0.177066   test-rmse:46.546091+0.519862 
## [444]    train-rmse:46.060860+0.169557   test-rmse:46.523013+0.516550 
## [445]    train-rmse:46.044945+0.171303   test-rmse:46.506530+0.507326 
## [446]    train-rmse:46.036119+0.173496   test-rmse:46.497330+0.514307 
## [447]    train-rmse:46.013606+0.182008   test-rmse:46.473806+0.517652 
## [448]    train-rmse:46.005001+0.174053   test-rmse:46.466906+0.516801 
## [449]    train-rmse:45.991470+0.159449   test-rmse:46.453999+0.515204 
## [450]    train-rmse:45.977358+0.159128   test-rmse:46.441499+0.514711 
## [451]    train-rmse:45.958541+0.169133   test-rmse:46.419887+0.510390 
## [452]    train-rmse:45.940621+0.174791   test-rmse:46.402671+0.508032 
## [453]    train-rmse:45.917406+0.158332   test-rmse:46.383033+0.513019 
## [454]    train-rmse:45.908152+0.155719   test-rmse:46.374632+0.513436 
## [455]    train-rmse:45.888639+0.148349   test-rmse:46.358321+0.513723 
## [456]    train-rmse:45.879127+0.144152   test-rmse:46.349227+0.515903 
## [457]    train-rmse:45.860329+0.136885   test-rmse:46.332120+0.518566 
## [458]    train-rmse:45.828136+0.148722   test-rmse:46.302358+0.534762 
## [459]    train-rmse:45.814710+0.148782   test-rmse:46.292790+0.539094 
## [460]    train-rmse:45.810117+0.148493   test-rmse:46.288121+0.540062 
## [461]    train-rmse:45.796974+0.150638   test-rmse:46.277497+0.544760 
## [462]    train-rmse:45.783106+0.159720   test-rmse:46.262244+0.545237 
## [463]    train-rmse:45.767663+0.162771   test-rmse:46.246498+0.546676 
## [464]    train-rmse:45.749870+0.164724   test-rmse:46.229823+0.539200 
## [465]    train-rmse:45.732860+0.169222   test-rmse:46.215170+0.546710 
## [466]    train-rmse:45.720686+0.174869   test-rmse:46.200266+0.561913 
## [467]    train-rmse:45.704133+0.165141   test-rmse:46.187659+0.564464 
## [468]    train-rmse:45.694306+0.168043   test-rmse:46.178003+0.566280 
## [469]    train-rmse:45.686076+0.170003   test-rmse:46.169870+0.568332 
## [470]    train-rmse:45.671399+0.167197   test-rmse:46.157477+0.567083 
## [471]    train-rmse:45.636745+0.172462   test-rmse:46.126946+0.581677 
## [472]    train-rmse:45.621850+0.171584   test-rmse:46.110007+0.569973 
## [473]    train-rmse:45.593334+0.168309   test-rmse:46.080480+0.587886 
## [474]    train-rmse:45.584219+0.167995   test-rmse:46.070905+0.591608 
## [475]    train-rmse:45.571954+0.167452   test-rmse:46.061178+0.594360 
## [476]    train-rmse:45.550243+0.179039   test-rmse:46.041114+0.614364 
## [477]    train-rmse:45.537283+0.172300   test-rmse:46.027288+0.610681 
## [478]    train-rmse:45.513496+0.175295   test-rmse:46.003350+0.612629 
## [479]    train-rmse:45.490511+0.161270   test-rmse:45.977981+0.593276 
## [480]    train-rmse:45.475863+0.164596   test-rmse:45.963698+0.596320 
## [481]    train-rmse:45.456282+0.166386   test-rmse:45.946568+0.597153 
## [482]    train-rmse:45.446410+0.162893   test-rmse:45.936001+0.588190 
## [483]    train-rmse:45.435695+0.153688   test-rmse:45.926766+0.584428 
## [484]    train-rmse:45.430102+0.154594   test-rmse:45.921065+0.581302 
## [485]    train-rmse:45.414578+0.157382   test-rmse:45.906327+0.586862 
## [486]    train-rmse:45.396398+0.151450   test-rmse:45.888352+0.576171 
## [487]    train-rmse:45.385769+0.150543   test-rmse:45.877299+0.569684 
## [488]    train-rmse:45.382031+0.151747   test-rmse:45.875248+0.570096 
## [489]    train-rmse:45.369416+0.158593   test-rmse:45.863530+0.577850 
## [490]    train-rmse:45.366287+0.157468   test-rmse:45.861518+0.578351 
## [491]    train-rmse:45.350783+0.153858   test-rmse:45.847524+0.574549 
## [492]    train-rmse:45.334451+0.143255   test-rmse:45.830926+0.564263 
## [493]    train-rmse:45.319819+0.145327   test-rmse:45.816966+0.569166 
## [494]    train-rmse:45.298932+0.139875   test-rmse:45.796420+0.571776 
## [495]    train-rmse:45.286228+0.142201   test-rmse:45.782812+0.566775 
## [496]    train-rmse:45.270842+0.138614   test-rmse:45.768091+0.567127 
## [497]    train-rmse:45.260438+0.141958   test-rmse:45.758467+0.568403 
## [498]    train-rmse:45.237271+0.144656   test-rmse:45.739698+0.565418 
## [499]    train-rmse:45.217601+0.141640   test-rmse:45.722453+0.573798 
## [500]    train-rmse:45.205877+0.146292   test-rmse:45.713568+0.577943
best_nrounds <- cv_results$best_iteration
# Train the final model using the best number of rounds found
model_xgb <- xgb.train(
  params = params,
  data = train_dmatrix,
  nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb, train_dmatrix)
test_pred <- predict(model_xgb, test_dmatrix)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)

# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)

train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
cat("Model Performance Metrics:\n",
    "--------------------------\n",
    "Training RMSE: ", train_rmse, "\n",
    "Test RMSE: ", test_rmse, "\n",
    "Training R-squared: ", r_squared_train, "\n",
    "Test R-squared: ", r_squared_test, "\n",
    "Training MAE: ", train_mae, "\n",
    "Test MAE: ", test_mae, "\n",
    "Training MAPE: ", train_mape, "%\n",
    "Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 44.82021
## Test RMSE: 44.16044
## Training R-squared: 0.5532223
## Test R-squared: 0.554944
## Training MAE: 28.18775
## Test MAE: 27.78375
## Training MAPE: 230.3977%
## Test MAPE: 227.1307%
# Calculate feature importance
importance_matrix2 <- xgb.importance(feature_names = colnames(train_features), model = model_xgb)

# View the feature importance scores
print(importance_matrix2)
##                          Feature        Gain       Cover   Frequency
##                           <char>       <num>       <num>       <num>
##  1:                     POP_SQMI 0.340931387 0.393861289 0.336921734
##  2:            BEAUTIFUL.GREENER 0.116209505 0.036435002 0.062263602
##  3:                        SOCAL 0.077321094 0.033451523 0.035205121
##  4:           WEEKS_SINCE_LAUNCH 0.072864529 0.144839771 0.138201920
##  5:           X12SMALL.12ONE.CUP 0.057392785 0.083301611 0.054989817
##  6: HILL.MOISTURE.THRASHED.APPLE 0.057310016 0.016375429 0.058481234
##  7:                     NORTHERN 0.044424316 0.019071063 0.025894676
##  8:      BEAUTIFUL.GREENER..PLUM 0.033698503 0.011734788 0.023276113
##  9:                 SINGLE.GROUP 0.031417764 0.019404404 0.018329939
## 10:                     MOUNTAIN 0.027541042 0.042845101 0.029386093
## 11:                       KANSAS 0.025484787 0.024550632 0.018620890
## 12:                      ARIZONA 0.021486319 0.014530069 0.025312773
## 13:           X12SMALL.24ONE.CUP 0.015837330 0.006719971 0.026476578
## 14:       RAINING..THRASHED.PLUM 0.013351363 0.005598469 0.016002328
## 15:                ZIZZLES..PLUM 0.012424925 0.009293321 0.008437591
## 16:           X12SMALL.18ONE.CUP 0.010776187 0.007169854 0.009601397
## 17:                  CALI_NEVADA 0.007987276 0.022850582 0.014547571
## 18:                    DESERT_SW 0.006845436 0.007852210 0.015420425
## 19:                    NEWMEXICO 0.006266702 0.012127745 0.009019494
## 20:                     COLORADO 0.006081127 0.033611868 0.019784696
## 21:                      PRAIRIE 0.005389787 0.017191027 0.015711376
## 22:                        NOCAL 0.004704506 0.025138140 0.018620890
## 23:            X12SMALL.6ONE.CUP 0.002718068 0.004951345 0.013383765
## 24:           X12SMALL.20ONE.CUP 0.001535244 0.007094786 0.006109980
##                          Feature        Gain       Cover   Frequency
xgb.plot.importance(importance_matrix = importance_matrix2)

>

# Define vectors for each category
str(one_hot_plum)
## tibble [43,366 × 37] (S3: tbl_df/tbl/data.frame)
##  $ DATE                        : Date[1:43366], format: "2022-06-18" "2022-04-30" ...
##  $ UNIT_SALES                  : num [1:43366] 1 14 18 13 19 4 29 35 75 25 ...
##  $ DOLLAR_SALES                : num [1:43366] 4.62 86.86 89.73 65.6 72.93 ...
##  $ POP_SQMI                    : num [1:43366] 1.2 1.2 1.2 1.2 1.2 ...
##  $ MONTH                       : num [1:43366] 6 4 12 7 11 6 1 11 10 12 ...
##  $ SEASON                      : chr [1:43366] "SUMMER" "SPRING" "WINTER" "SUMMER" ...
##  $ PACKAGE2                    : chr [1:43366] "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
##  $ ENERGY_DRINK                : num [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ CALORIC_SEGMENT_TEXT        : chr [1:43366] NA NA NA NA ...
##  $ min_launch_date             : Date[1:43366], format: "2021-09-04" "2021-09-04" ...
##  $ WEEKS_SINCE_LAUNCH          : num [1:43366] 41 34 14 47 12 40 20 9 6 13 ...
##  $ NORTHERN                    : int [1:43366] 1 1 1 1 1 1 1 1 1 1 ...
##  $ CALI_NEVADA                 : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ DESERT_SW                   : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ MOUNTAIN                    : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SOCAL                       : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PRAIRIE                     : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ARIZONA                     : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ NEWMEXICO                   : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ NOCAL                       : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ COLORADO                    : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ KANSAS                      : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ HILL MOISTURE THRASHED APPLE: int [1:43366] 1 1 1 1 1 1 1 1 1 1 ...
##  $ BEAUTIFUL GREENER           : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SINGLE GROUP                : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ RAINING  THRASHED PLUM      : int [1:43366] 1 1 1 1 1 1 1 1 1 1 ...
##  $ BEAUTIFUL GREENER  PLUM     : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ZIZZLES  PLUM               : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 12ONE CUP           : int [1:43366] 1 1 1 1 1 1 1 1 1 1 ...
##  $ 12SMALL 6ONE CUP            : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 24ONE CUP           : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 20ONE CUP           : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 18ONE CUP           : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SUMMER                      : int [1:43366] 1 0 0 1 0 1 0 0 0 0 ...
##  $ SPRING                      : int [1:43366] 0 1 0 0 0 0 0 0 0 0 ...
##  $ WINTER                      : int [1:43366] 0 0 1 0 0 0 1 0 0 1 ...
##  $ FALL                        : int [1:43366] 0 0 0 0 1 0 0 1 1 0 ...
# List to store unique values for each variable

regions <- 1:11
brands <- 1:3
items <- 1:3
package_options <- 1:5

# Create data frame with all combinations of categories
combinations <- expand.grid(Region = regions, Brand = brands, Item = items, Package = package_options)

# Duplicate each combination 52 times to represent each week of the year
final_df_replicated <- combinations[rep(row.names(combinations), each = 52), ]

# Add a column with values from 1 to 52 for each combination
final_df_replicated$Week_of_Year <- rep(1:52, times = nrow(combinations))

# Duplicate each combination 52 times to represent each week of the year
final_df_replicated <- final_df_replicated[rep(row.names(final_df_replicated), each = 13), ]

# Add a column with values from 1 to 13 for each combination
final_df_replicated$Week_Since_Launch <- rep(1:13, times = nrow(combinations))

final_df_replicated$Region <- unique_values_list$REGION[final_df_replicated$Region]
final_df_replicated$Brand <- unique_values_list$BRAND[final_df_replicated$Brand]
final_df_replicated$Item <- unique_values_list$ITEM[final_df_replicated$Item]
final_df_replicated$Package <- unique_values_list$PACKAGE[final_df_replicated$Package]

# List to store unique values for each variable
new_unique_values_list <- list()

# Columns to get unique values for
new_columns_to_get_unique_values <- c("Region", "Brand", "Item", "Package")

# Get unique values for each variable and store in the list
for (col in new_columns_to_get_unique_values) {
  new_unique_values_list[[col]] <- unique(final_df_replicated[[col]])
}

# Loop over unique regions and create new columns
for (Region in new_unique_values_list$Region) {
  final_df_replicated[[Region]] <- as.integer(final_df_replicated$Region == Region)
}

# Loop over unique regions and create new columns
for (Brand in new_unique_values_list$Brand) {
  final_df_replicated[[Brand]] <- as.integer(final_df_replicated$Brand == Brand)
}

# Loop over unique regions and create new columns
for (Item in new_unique_values_list$Item) {
  final_df_replicated[[Item]] <- as.integer(final_df_replicated$Item == Item)
}

# Loop over unique regions and create new columns
for (Package in new_unique_values_list$Package) {
  final_df_replicated[[Package]] <- as.integer(final_df_replicated$Package == Package)
}

#Create dummy_data and remove non one hot encoded data
dummy_data <- final_df_replicated %>%
  select(-Region, -Brand, -Item, -Package)

#add a Unit sales column
dummy_data$UNIT_SALES <- NA
dummy_data$UNIT_SALES <- as.numeric(dummy_data$UNIT_SALES)

str(final_df_replicated)
## 'data.frame':    334620 obs. of  28 variables:
##  $ Region                      : chr  "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
##  $ Brand                       : chr  "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" ...
##  $ Item                        : chr  "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" "RAINING  THRASHED PLUM" ...
##  $ Package                     : chr  "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" ...
##  $ Week_of_Year                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Week_Since_Launch           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ NORTHERN                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ CALI_NEVADA                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DESERT_SW                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MOUNTAIN                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SOCAL                       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PRAIRIE                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ARIZONA                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NEWMEXICO                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NOCAL                       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ COLORADO                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ KANSAS                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HILL MOISTURE THRASHED APPLE: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ BEAUTIFUL GREENER           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SINGLE GROUP                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RAINING  THRASHED PLUM      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ BEAUTIFUL GREENER  PLUM     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ZIZZLES  PLUM               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 12ONE CUP           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ 12SMALL 6ONE CUP            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 24ONE CUP           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 20ONE CUP           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 18ONE CUP           : int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "out.attrs")=List of 2
##   ..$ dim     : Named int [1:4] 11 3 3 5
##   .. ..- attr(*, "names")= chr [1:4] "Region" "Brand" "Item" "Package"
##   ..$ dimnames:List of 4
##   .. ..$ Region : chr [1:11] "Region= 1" "Region= 2" "Region= 3" "Region= 4" ...
##   .. ..$ Brand  : chr [1:3] "Brand=1" "Brand=2" "Brand=3"
##   .. ..$ Item   : chr [1:3] "Item=1" "Item=2" "Item=3"
##   .. ..$ Package: chr [1:5] "Package=1" "Package=2" "Package=3" "Package=4" ...
str(dummy_data)
## 'data.frame':    334620 obs. of  25 variables:
##  $ Week_of_Year                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Week_Since_Launch           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ NORTHERN                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ CALI_NEVADA                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DESERT_SW                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MOUNTAIN                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SOCAL                       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PRAIRIE                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ARIZONA                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NEWMEXICO                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NOCAL                       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ COLORADO                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ KANSAS                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HILL MOISTURE THRASHED APPLE: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ BEAUTIFUL GREENER           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SINGLE GROUP                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RAINING  THRASHED PLUM      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ BEAUTIFUL GREENER  PLUM     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ZIZZLES  PLUM               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 12ONE CUP           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ 12SMALL 6ONE CUP            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 24ONE CUP           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 20ONE CUP           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ 12SMALL 18ONE CUP           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ UNIT_SALES                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "out.attrs")=List of 2
##   ..$ dim     : Named int [1:4] 11 3 3 5
##   .. ..- attr(*, "names")= chr [1:4] "Region" "Brand" "Item" "Package"
##   ..$ dimnames:List of 4
##   .. ..$ Region : chr [1:11] "Region= 1" "Region= 2" "Region= 3" "Region= 4" ...
##   .. ..$ Brand  : chr [1:3] "Brand=1" "Brand=2" "Brand=3"
##   .. ..$ Item   : chr [1:3] "Item=1" "Item=2" "Item=3"
##   .. ..$ Package: chr [1:5] "Package=1" "Package=2" "Package=3" "Package=4" ...
#rename columes to match original features

dummy_data <- dummy_data %>%
  rename(
    `BEAUTIFUL.GREENER` = `BEAUTIFUL GREENER`,   
    `X12SMALL.12ONE.CUP` = `12SMALL 12ONE CUP`,                                            `HILL.MOISTURE.THRASHED.APPLE` = `HILL MOISTURE THRASHED APPLE`,
    `BEAUTIFUL.GREENER..PLUM` = `BEAUTIFUL GREENER  PLUM`,
    `X12SMALL.24ONE.CUP` = `12SMALL 24ONE CUP`,
    `X12SMALL.6ONE.CUP` = `12SMALL 6ONE CUP`,
    `X12SMALL.20ONE.CUP` = `12SMALL 20ONE CUP`,
    `RAINING..THRASHED.PLUM`=`RAINING  THRASHED PLUM`,
    `X12SMALL.18ONE.CUP`=`12SMALL 18ONE CUP`,
    
  )

# Check for Matching Features
#Get the column names of Test and dummy_data
names_Test <- names(Test)
names_dummy_data <- names(dummy_data)

# Find the matching column names
matching_names <- intersect(names_Test, names_dummy_data)

# Find the non-matching column names
non_matching_names_Test <- setdiff(names_Test, matching_names)
non_matching_names_dummy_data <- setdiff(names_dummy_data, matching_names)

#Print the matching and non-matching column names
cat("Matching column names:", paste(matching_names, collapse = ", "), "\n")
## Matching column names: UNIT_SALES, NORTHERN, CALI_NEVADA, DESERT_SW, MOUNTAIN, SOCAL, PRAIRIE, ARIZONA, NEWMEXICO, NOCAL, COLORADO, KANSAS, HILL.MOISTURE.THRASHED.APPLE, BEAUTIFUL.GREENER, RAINING..THRASHED.PLUM, BEAUTIFUL.GREENER..PLUM, X12SMALL.12ONE.CUP, X12SMALL.6ONE.CUP, X12SMALL.24ONE.CUP, X12SMALL.20ONE.CUP, X12SMALL.18ONE.CUP
cat("Non-matching column names in Test:", paste(non_matching_names_Test, collapse = ", "), "\n")
## Non-matching column names in Test: POP_SQMI, PACKAGE2, ENERGY_DRINK, CALORIC_SEGMENT_TEXT, min_launch_date, WEEKS_SINCE_LAUNCH, SINGLE.GROUP, ZIZZLES..PLUM
cat("Non-matching column names in dummy_data:", paste(non_matching_names_dummy_data, collapse = ", "), "\n")
## Non-matching column names in dummy_data: Week_of_Year, Week_Since_Launch, SINGLE GROUP, ZIZZLES  PLUM
# Adding the non-matching columns to dummy_data with default values
for (col in non_matching_names_Test) {
  dummy_data[[col]] <- NA  # You can change NA to any default value you prefer
}


# Get the column names of the Test dataframe
test_colnames <- colnames(Test)

# Reorder columns of dummy_data to match the order of columns in Test
dummy_data <- dummy_data %>%
  select(all_of(test_colnames))

# Prepare features for XGBoost
dummy_features <- dummy_data[, -which(names(dummy_data) == "UNIT_SALES")]


# Convert data to DMatrix format

dummy_dmatrix<- xgb.DMatrix(data = as.matrix(dummy_features))

dummy_pred <- predict(model_xgb, dummy_dmatrix)

# Add the predictions to dummy_data
dummy_data$Predictions <- dummy_pred

# Convert predictions to integers
dummy_data$Predictions <- round(dummy_pred)

# Convert to integer data type
dummy_data$Predictions <- as.integer(dummy_data$Predictions)

# Apply post-processing to ensure non-negative predictions
dummy_pred <- predict(model_xgb, dummy_dmatrix)

# Set negative predictions to zero
dummy_pred <- pmax(dummy_pred, 0)

# Add the adjusted predictions to dummy_data
dummy_data$Predictions <- dummy_pred

summary(dummy_data$Predictions)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   6.803  22.251  35.879 154.412
ggplot(dummy_data, aes(x = Predictions)) +
  geom_density(fill = "blue", alpha = 0.5) +
  labs(title = "Density Plot of Predicted Values",
       x = "Predicted Values",
       y = "Density")

plum %>%
  summarize(n = n(),
            AVG_UNIT_SALES = mean(UNIT_SALES))
## # A tibble: 1 × 2
##       n AVG_UNIT_SALES
##   <int>          <dbl>
## 1 43366           55.9
# Prediction Results
summary(dummy_data$Predictions)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   6.803  22.251  35.879 154.412
dummy_data %>%
  summarize(n = n(),
            AVG_UNIT_SALES = mean(Predictions))
##        n AVG_UNIT_SALES
## 1 334620       22.25126
# #stop cluster
# doParallel::stopImplicitCluster()
#cleanup all obj
rm(list = ls())

Dummy data for unseen innovation products does not provide the most confident results.

#Reference Q information for this flava flav > Item Description: Diet Energy Moonlit Casava 2L Multi Jug Caloric Segment: Diet Market Category: Energy Manufacturer: Swire-CC Brand: Diet Moonlit Package Type: 2L Multi Jug Flavor: ‘Cassava’ Swire plans to release this product for 6 months. What will the forecasted demand be, in weeks, for this product?

Taking a sample of the whole dataset

df <- readRDS("swire_no_nas.rds")  #inject the data and we will sub-sample

Quick imputations

# Update CALORIC_SEGMENT values: 0 if 'DIET/LIGHT', otherwise 1
df$CALORIC_SEGMENT <- ifelse(df$CALORIC_SEGMENT == "DIET/LIGHT", 0, 1)
df$MARKET_KEY <- as.character(df$MARKET_KEY)
df <- df %>%
  mutate(
    MONTH = as.numeric(substr(DATE, 6, 7)),  # Extract the month from YYYY-MM-DD format
    SEASON = case_when(
      MONTH %in% c(12, 01, 02) ~ "WINTER",
      MONTH %in% c(03, 04, 05) ~ "SPRING",
      MONTH %in% c(06, 07, 08) ~ "SUMMER",
      MONTH %in% c(09, 10, 11) ~ "FALL",
      TRUE ~ NA_character_  # This is just in case there are any undefined values
    )
  )



regions_joinme <- read.csv("states_summary.csv")

unique(regions_joinme$REGION)
##  [1] "NORTHERN"    "DESERT_SW"   "PRAIRIE"     "CALI_NEVADA" "MOUNTAIN"   
##  [6] "SOCAL"       "ARIZONA"     "NEWMEXICO"   "NOCAL"       "COLORADO"   
## [11] "KANSAS"
# "NORTHERN"    "DESERT_SW"   "PRAIRIE"     "CALI_NEVADA"  "MOUNTAIN"    "SOCAL"   "ARIZONA"    "NEWMEXICO"   "NOCAL"    "COLORADO"    "KANSAS" 

str(regions_joinme)
## 'data.frame':    200 obs. of  2 variables:
##  $ MARKET_KEY: int  13 70 179 197 272 352 32 33 44 50 ...
##  $ REGION    : chr  "NORTHERN" "NORTHERN" "DESERT_SW" "DESERT_SW" ...
# Perform a left join using the merge() function
df <- merge(df, regions_joinme[, c("MARKET_KEY", "REGION")], by = "MARKET_KEY", all.x = TRUE)
rm(regions_joinme)
str(df)
## 'data.frame':    24461424 obs. of  13 variables:
##  $ MARKET_KEY     : chr  "1" "1" "1" "1" ...
##  $ DATE           : chr  "2021-10-16" "2022-06-04" "2022-02-05" "2022-10-08" ...
##  $ CALORIC_SEGMENT: num  0 0 1 0 0 1 0 0 1 0 ...
##  $ CATEGORY       : chr  "ENERGY" "SSD" "SSD" "SSD" ...
##  $ UNIT_SALES     : num  434 28 42 1 26 161 6 5 68 90 ...
##  $ DOLLAR_SALES   : num  924.04 147.77 25.13 0.99 94.56 ...
##  $ MANUFACTURER   : chr  "PONYS" "SWIRE-CC" "COCOS" "JOLLYS" ...
##  $ BRAND          : chr  "MYTHICAL BEVERAGE ULTRA" "DIET PEPPY CF" "HANSENIZZLE'S ECO" "DIET PAPI" ...
##  $ PACKAGE        : chr  "16SMALL MULTI CUP" "12SMALL 12ONE CUP" "12SMALL 6ONE CUP" "12SMALL 6ONE CUP" ...
##  $ ITEM           : chr  "MYTHICAL BEVERAGE ULTRA SUNRISE ENERGY DRINK UNFLAVORED ZERO SUGAR CUP 16 LIQUID SMALL" "DIET PEPPY CAFFEINE FREE GENTLE DRINK RED  PEPPER COLA DIET CUP 12 LIQUID SMALL X12" "HANSENIZZLE'S ECO GENTLE DRINK MANDARIN DURIAN  CUP 12 LIQUID SMALL" "DIET PAPI GENTLE DRINK COLA DIET CUP 12 LIQUID SMALL" ...
##  $ MONTH          : num  10 6 2 10 7 9 9 6 10 5 ...
##  $ SEASON         : chr  "FALL" "SUMMER" "WINTER" "FALL" ...
##  $ REGION         : chr  "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...

Making a 10% sample of the data to shrink it

# Assuming df is your dataframe
set.seed(123) # Set a random seed for reproducibility
sampled_df <- df[sample(1:nrow(df), 2446143), ]
rm(df)
df <- sampled_df
rm(sampled_df)
#skim(df)
summary(df)
##   MARKET_KEY            DATE           CALORIC_SEGMENT    CATEGORY        
##  Length:2446143     Length:2446143     Min.   :0.0000   Length:2446143    
##  Class :character   Class :character   1st Qu.:0.0000   Class :character  
##  Mode  :character   Mode  :character   Median :1.0000   Mode  :character  
##                                        Mean   :0.5024                     
##                                        3rd Qu.:1.0000                     
##                                        Max.   :1.0000                     
##    UNIT_SALES        DOLLAR_SALES      MANUFACTURER          BRAND          
##  Min.   :    0.04   Min.   :     0.0   Length:2446143     Length:2446143    
##  1st Qu.:   11.00   1st Qu.:    36.5   Class :character   Class :character  
##  Median :   40.00   Median :   134.9   Mode  :character   Mode  :character  
##  Mean   :  173.87   Mean   :   590.0                                        
##  3rd Qu.:  126.00   3rd Qu.:   427.3                                        
##  Max.   :92448.00   Max.   :392062.7                                        
##    PACKAGE              ITEM               MONTH           SEASON         
##  Length:2446143     Length:2446143     Min.   : 1.000   Length:2446143    
##  Class :character   Class :character   1st Qu.: 3.000   Class :character  
##  Mode  :character   Mode  :character   Median : 6.000   Mode  :character  
##                                        Mean   : 6.287                     
##                                        3rd Qu.: 9.000                     
##                                        Max.   :12.000                     
##     REGION         
##  Length:2446143    
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Linear model on sampled data looks the same largely

# Perform a linear regression with UNIT_SALES as the dependent variable
# and PRICE (or your chosen variable) as the independent variable
linear_model <- lm(DOLLAR_SALES ~ UNIT_SALES, data = df)

# Print the summary of the linear model to see the results
summary(linear_model)
## 
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -107746    -112     -60       3  239824 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 60.563703   1.050071   57.68   <2e-16 ***
## UNIT_SALES   3.045165   0.001204 2528.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1609 on 2446141 degrees of freedom
## Multiple R-squared:  0.7232, Adjusted R-squared:  0.7232 
## F-statistic: 6.392e+06 on 1 and 2446141 DF,  p-value: < 2.2e-16
# Create a scatter plot with the regression line, colored by MANUFACTURER
ggplot(df, aes(x = UNIT_SALES, y = DOLLAR_SALES, color = MANUFACTURER)) +
  geom_point(alpha = 0.5) +  # Adjust alpha to avoid overplotting, if necessary
  geom_smooth(method = "lm", color = "black", se = FALSE) +  # Add linear regression line without confidence band for clarity
  labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES by MANUFACTURER",
       x = "UNIT SALES",
       y = "DOLLAR SALES") +
  theme_minimal() +
  theme(legend.position = "bottom")  # Adjust legend position if needed
## `geom_smooth()` using formula = 'y ~ x'

Taking a look at Diet Moonlit brand..

# create a table of total values by brand
brand_summary <- df %>%
  group_by(BRAND) %>%
  summarise(
    total_units_sold = sum(UNIT_SALES),
    total_revenue = sum(DOLLAR_SALES),
    avg_price = total_revenue / total_units_sold,
    total_days_sold = n() # Count the number of rows for each brand
  ) %>%
  arrange(desc(total_revenue)) %>%  # Order by revenue in descending order
  mutate(rank = row_number()) 

summary(brand_summary)
##     BRAND           total_units_sold   total_revenue         avg_price      
##  Length:286         Min.   :       1   Min.   :        1   Min.   : 0.1658  
##  Class :character   1st Qu.:    2575   1st Qu.:     8778   1st Qu.: 2.0501  
##  Mode  :character   Median :   98109   Median :   296741   Median : 2.9876  
##                     Mean   : 1487092   Mean   :  5046437   Mean   : 3.2334  
##                     3rd Qu.:  669287   3rd Qu.:  2159509   3rd Qu.: 3.7412  
##                     Max.   :41531681   Max.   :164499278   Max.   :42.9411  
##  total_days_sold         rank       
##  Min.   :     1.0   Min.   :  1.00  
##  1st Qu.:   134.8   1st Qu.: 72.25  
##  Median :  2116.0   Median :143.50  
##  Mean   :  8553.0   Mean   :143.50  
##  3rd Qu.:  8275.0   3rd Qu.:214.75  
##  Max.   :123829.0   Max.   :286.00
print(brand_summary[brand_summary$BRAND == "DIET MOONLIT", ])
## # A tibble: 1 × 6
##   BRAND        total_units_sold total_revenue avg_price total_days_sold  rank
##   <chr>                   <dbl>         <dbl>     <dbl>           <int> <int>
## 1 DIET MOONLIT          747385.      2502014.      3.35            7587    70

Diet Moonlit is a rising star ranking 69 out of 288 brands in terms of total revenue, with an average price of $3.50 slightly above the overall mean of $3.27.

# Filter the dataframe for only 'DIET SMASH'
filtered_df <- df %>% 
  filter(BRAND == "DIET MOONLIT")

# Create the plot
ggplot(filtered_df, aes(x = UNIT_SALES, y = DOLLAR_SALES)) +
  geom_point(color = "red", alpha = 1) +  # Bright red points with full opacity
  geom_smooth(method = "lm", color = "black", se = FALSE) +  # Add linear regression line without confidence band
  labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES for DIET SMASH",
       x = "UNIT SALES",
       y = "DOLLAR SALES") +
  theme_minimal() +
  theme(legend.position = "none")  
## `geom_smooth()` using formula = 'y ~ x'

DIET MOONLIT has a tight cluster below 1,000 unit sales and $2,500 revenue, but there are some remarkable high fliers nearing $20,000 and just over 3000 units.

Sales by Week of the Year

filtered_df %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
  group_by(WEEK) %>%
  summarise(total_sales = sum(UNIT_SALES)) %>%
  ggplot(aes(x = WEEK, y = total_sales)) +
  geom_line(color = "black") +  # Blue line connecting points
  labs(title = "Total Sales by Week of the Year",
       x = "Week of the Year",
       y = "Total Unit Sales") +
  theme_minimal()

> DIET MOONLIT shows many peaks and valleys in sales by week.

library(zoo)
# Calculate total sales for each group of 211 consecutive weeks (6 months)
sales_by_group <- filtered_df %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
  group_by(WEEK) %>%
  summarise(total_sales = sum(UNIT_SALES)) %>%
  mutate(sales_in_group = rollsum(total_sales, 21, align = "left", fill = NA)) %>%
  mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 21)) %>%
  arrange(WEEK) %>%  # Order by WEEK
  filter(!is.na(sales_in_group))  # Remove rows with sales_in_group = NA

# Plot the bar chart
sales_by_group$week_label <- factor(sales_by_group$week_label, levels = sales_by_group$week_label[order(sales_by_group$WEEK)])
ggplot(sales_by_group, aes(x = factor(week_label), y = sales_in_group)) +
  geom_bar(stat = "identity", fill = "black") +
  labs(title = "Total Sales for Each 6-month Grouping",
       x = "Weeks (Starting from Week 1)",
       y = "Total Sales") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

> DIET MOONLIT has it’s best 6 month runs week 7 - 27 historically.

#find the best 21 weeks for Casava sales
# Calculate total sales for each group of 21 consecutive weeks
sales_by_casava <- df %>%
  filter(str_detect(ITEM, "CASAVA")) %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
  group_by(WEEK) %>%
  summarise(total_sales = sum(UNIT_SALES)) %>%
  mutate(sales_in_group = rollsum(total_sales, 21, align = "left", fill = NA)) %>%
  mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 21)) %>%
  arrange(WEEK) %>%  # Order by WEEK
  filter(!is.na(sales_in_group))  # Remove rows with sales_in_group = NA

# Plot the bar chart
sales_by_casava$week_label <- factor(sales_by_casava$week_label, levels = sales_by_casava$week_label[order(sales_by_casava$WEEK)])
ggplot(sales_by_casava, aes(x = factor(week_label), y = sales_in_group)) +
  geom_bar(stat = "identity", fill = "black") +
  labs(title = "Total Sales for Each 21-Week Grouping",
       x = "Weeks (Starting from Week 1)",
       y = "Total Sales") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

> Casava sales are best in the 21 weeks from week 14 to 34.

#find the best 21 weeks for casava, energy, diet, s
# Calculate total sales for each group of 21 consecutive weeks

sales_by_innovation <- df %>%
  filter(CATEGORY == "ENERGY",
         str_detect(ITEM, "CASAVA")) %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
  group_by(WEEK) %>%
  summarise(total_sales = sum(UNIT_SALES)) %>%
  mutate(sales_in_group = rollsum(total_sales, 13, align = "left", fill = NA)) %>%
  mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 13)) %>%
  arrange(WEEK) %>%  # Order by WEEK
  filter(!is.na(sales_in_group))  # Remove rows with sales_in_group = NA

# Plot the bar chart
sales_by_innovation$week_label <- factor(sales_by_innovation$week_label, levels = sales_by_innovation$week_label[order(sales_by_innovation$WEEK)])
ggplot(sales_by_innovation, aes(x = factor(week_label), y = sales_in_group)) +
  geom_bar(stat = "identity", fill = "black") +
  labs(title = "Total Sales for Each 13-Week Grouping",
       x = "Weeks (Starting from Week 1)",
       y = "Total Sales") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

Make a new smaller “innovation” data frame

#create innovation based on Energy, Casava
innovation<- df %>%
  filter(CATEGORY == "ENERGY",
         str_detect(ITEM, "CASAVA"))


#unique PACKAGE string from innovation
print(unique(innovation$PACKAGE))
## [1] "16SMALL MULTI CUP" "16SMALL 24ONE CUP"
library(dplyr)
library(lubridate)

innovation <- innovation %>%
  mutate(
    MONTH = month(ymd(DATE)),  # Extract month using lubridate's ymd function
    MONTH = as.factor(MONTH)   # Convert the extracted month into a factor
  )

str(innovation)
## 'data.frame':    5045 obs. of  13 variables:
##  $ MARKET_KEY     : chr  "59" "133" "965" "303" ...
##  $ DATE           : chr  "2022-07-23" "2023-01-28" "2022-02-12" "2022-10-22" ...
##  $ CALORIC_SEGMENT: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ CATEGORY       : chr  "ENERGY" "ENERGY" "ENERGY" "ENERGY" ...
##  $ UNIT_SALES     : num  1 55 186 123 159 3 54 17 102 58 ...
##  $ DOLLAR_SALES   : num  2.49 145.36 417.21 210.09 407.14 ...
##  $ MANUFACTURER   : chr  "JOLLYS" "PONYS" "PONYS" "JOLLYS" ...
##  $ BRAND          : chr  "SUPER-DUPER RECOVERY" "MYTHICAL BEVERAGE" "MYTHICAL BEVERAGE" "SUPER-DUPER RECOVERY" ...
##  $ PACKAGE        : chr  "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" ...
##  $ ITEM           : chr  "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA  JACK  CUP 16 LIQUID SMALL" "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA  CUP 16 LIQUID SMALL" "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA  CUP 16 LIQUID SMALL" "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA  JACK  CUP 16 LIQUID SMALL" ...
##  $ MONTH          : Factor w/ 12 levels "1","2","3","4",..: 7 1 2 10 11 8 3 3 6 6 ...
##  $ SEASON         : chr  "SUMMER" "WINTER" "WINTER" "FALL" ...
##  $ REGION         : chr  "CALI_NEVADA" "MOUNTAIN" "COLORADO" "MOUNTAIN" ...
print(unique(innovation$ITEM))
## [1] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA  JACK  CUP 16 LIQUID SMALL"                    
## [2] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA  CUP 16 LIQUID SMALL"               
## [3] "SUPER-DUPER JUICED ENERGY DRINK CASAVA  SUNSET  GUAVA CUP 16 LIQUID SMALL"              
## [4] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA  NO ARTIFICIAL SWEETENERS CUP 16 LIQUID SMALL"
## [5] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA  CUP 16 LIQUID SMALL X24"           
## [6] "SUPER-DUPER PURE ZERO ENERGY DRINK CASAVA  YELLOW  SUGAR FREE CUP 16 LIQUID SMALL"
# Count the number of unique PACKAGE column of our sample
table(innovation$PACKAGE)
## 
## 16SMALL 24ONE CUP 16SMALL MULTI CUP 
##                 2              5043
# Creating an 'innovation' data frame
#factor Region
model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + REGION, data = innovation)
summary(model)
## 
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + 
##     REGION, data = innovation)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1001.45   -11.82     1.66    16.23  1053.09 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               22.760063  70.086746   0.325  0.74539    
## UNIT_SALES                 2.294841   0.004102 559.512  < 2e-16 ***
## CALORIC_SEGMENT           -5.198822  57.246299  -0.091  0.92764    
## PACKAGE16SMALL MULTI CUP -22.051746  40.400212  -0.546  0.58521    
## REGIONCALI_NEVADA          2.091218   4.713024   0.444  0.65727    
## REGIONCOLORADO             4.879770   2.979656   1.638  0.10155    
## REGIONDESERT_SW           -1.086808   3.561049  -0.305  0.76023    
## REGIONKANSAS              -5.588681   7.329994  -0.762  0.44583    
## REGIONMOUNTAIN           -22.267932   3.006636  -7.406 1.52e-13 ***
## REGIONNEWMEXICO           -0.872840   4.491293  -0.194  0.84592    
## REGIONNOCAL                8.468503   4.357528   1.943  0.05202 .  
## REGIONNORTHERN           -10.153914   2.297398  -4.420 1.01e-05 ***
## REGIONPRAIRIE             -7.516506   5.472385  -1.374  0.16965    
## REGIONSOCAL                9.365306   3.471712   2.698  0.00701 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.1 on 5031 degrees of freedom
## Multiple R-squared:  0.9877, Adjusted R-squared:  0.9876 
## F-statistic: 3.099e+04 on 13 and 5031 DF,  p-value: < 2.2e-16

Cassava and Energy together do quite well (not possible to also add in DIET, but we will expect that folks that like Cassava Regular Energy will also like DIET). R2 of 0.98. Summer is statistically signficant, but negatively correlated with sales. SOCAL and NOCAL are significant in the positive direction.

#More exploration

library(dplyr)

small_group <- df %>%
  filter(UNIT_SALES < 7000, DOLLAR_SALES < 20000)

skim(small_group)
Data summary
Name small_group
Number of rows 2440779
Number of columns 13
_______________________
Column type frequency:
character 9
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MARKET_KEY 0 1 1 4 0 200 0
DATE 0 1 10 10 0 152 0
CATEGORY 0 1 3 18 0 5 0
MANUFACTURER 0 1 5 8 0 8 0
BRAND 0 1 4 56 0 286 0
PACKAGE 0 1 11 26 0 95 0
ITEM 0 1 26 142 0 2984 0
SEASON 0 1 4 6 0 4 0
REGION 0 1 5 11 0 11 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.50 0.50 0.00 0.00 1.00 1.00 1.0 ▇▁▁▁▇
UNIT_SALES 0 1 149.97 382.85 0.04 11.00 40.00 125.00 6998.0 ▇▁▁▁▁
DOLLAR_SALES 0 1 496.94 1222.46 0.01 36.36 134.34 423.52 19999.4 ▇▁▁▁▁
MONTH 0 1 6.29 3.43 1.00 3.00 6.00 9.00 12.0 ▇▆▆▅▇
skim(df %>% filter(BRAND == "DIET MOONLIT"))
Data summary
Name df %>% filter(BRAND == “D…
Number of rows 7587
Number of columns 13
_______________________
Column type frequency:
character 9
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MARKET_KEY 0 1 1 4 0 200 0
DATE 0 1 10 10 0 147 0
CATEGORY 0 1 3 3 0 1 0
MANUFACTURER 0 1 8 8 0 1 0
BRAND 0 1 12 12 0 1 0
PACKAGE 0 1 12 17 0 5 0
ITEM 0 1 50 63 0 5 0
SEASON 0 1 4 6 0 4 0
REGION 0 1 5 11 0 11 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
UNIT_SALES 0 1 98.51 290.67 1.00 23.00 51.00 92.00 4720.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 329.78 921.76 0.75 56.74 133.91 321.35 21447.56 ▇▁▁▁▁
MONTH 0 1 6.38 3.41 1.00 4.00 6.00 9.00 12.00 ▇▆▆▅▇

Our small df has a higher mean of unit sales and dollar sales of 149 and $496. as compared to the full df of DIET MOONLIT of 98 and $344.

# Create a scatter plot with the regression line, colored by MANUFACTURER
ggplot(small_group, aes(x = UNIT_SALES, y = DOLLAR_SALES, color = MANUFACTURER)) +
  geom_point(alpha = 0.5) +  # Adjust alpha to avoid overplotting, if necessary
  geom_smooth(method = "lm", color = "black", se = FALSE) +  # Add linear regression line without confidence band for clarity
  labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES by MANUFACTURER",
       x = "UNTI SALES",
       y = "DOLLAR SALES") +
  theme_minimal() +
  theme(legend.position = "bottom")  # Adjust legend position if needed
## `geom_smooth()` using formula = 'y ~ x'

Behold the realm of DIET MOONLIT. Certain items sell much better, or wosrse with consideration of slop of dollars to units sold. While most of its realm is in the lower left hand portion, other brands have sales through both its unit and dollar sales vectors.

#Make the small casava df > Investigating drinks with casava as a flavor in the Item description.

# Create a new data frame with only the rows where the ITEM column contains the word 'casava'
casava_small <- df[grep("casava", df$ITEM, ignore.case = TRUE), ]
skim(casava_small)
Data summary
Name casava_small
Number of rows 42420
Number of columns 13
_______________________
Column type frequency:
character 9
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MARKET_KEY 0 1 1 4 0 200 0
DATE 0 1 10 10 0 152 0
CATEGORY 0 1 3 18 0 4 0
MANUFACTURER 0 1 5 8 0 5 0
BRAND 0 1 5 26 0 28 0
PACKAGE 0 1 12 26 0 25 0
ITEM 0 1 46 112 0 83 0
SEASON 0 1 4 6 0 4 0
REGION 0 1 5 11 0 11 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.61 0.49 0.00 0.00 1.00 1.00 1.00 ▅▁▁▁▇
UNIT_SALES 0 1 73.35 183.67 1.00 9.00 30.00 79.00 6678.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 188.68 405.78 0.35 23.32 75.85 213.08 12886.48 ▇▁▁▁▁
MONTH 0 1 6.47 3.35 1.00 4.00 7.00 9.00 12.00 ▇▆▆▅▇

Casava has a much lower unit sales and dollar sales at 71 and $184 than Diet Moonlight at 98 and $344.

# casava small is dataframe
#factor Region
casava_small$REGION <- as.factor(casava_small$REGION)
model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + CATEGORY + SEASON, data = casava_small)
summary(model)
## 
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + 
##     CATEGORY + SEASON, data = casava_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3376.3   -31.4    -3.6    26.7  7209.9 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        9.764e+01  3.544e+01   2.755 0.005872 ** 
## UNIT_SALES                         2.006e+00  4.145e-03 483.899  < 2e-16 ***
## CALORIC_SEGMENT                    1.222e+01  3.415e+01   0.358 0.720615    
## PACKAGE.5L 6ONE JUG               -4.559e+00  7.168e+00  -0.636 0.524707    
## PACKAGE.5L MULTI JUG              -3.817e+00  5.248e+01  -0.073 0.942017    
## PACKAGE12SMALL 12ONE CUP           2.202e+02  8.707e+00  25.291  < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG -2.275e+01  1.090e+02  -0.209 0.834658    
## PACKAGE12SMALL 4ONE PET           -3.135e+01  1.539e+02  -0.204 0.838570    
## PACKAGE12SMALL 6ONE CUP           -4.787e+01  3.128e+01  -1.530 0.125948    
## PACKAGE12SMALL 6ONE MEDIUM CUP    -1.622e+01  2.084e+01  -0.778 0.436308    
## PACKAGE12SMALL 8ONE BUMPY CUP      7.122e-01  1.845e+01   0.039 0.969204    
## PACKAGE12SMALL 8ONE CUP            1.194e+02  3.092e+01   3.860 0.000114 ***
## PACKAGE12SMALL MLT MEDIUM CUP     -3.776e+01  4.514e+01  -0.836 0.402961    
## PACKAGE12SMALL MLT PLASTICS JUG   -4.270e+01  9.496e+00  -4.497 6.91e-06 ***
## PACKAGE12SMALL MULTI CUP           3.034e+01  5.027e+01   0.604 0.546130    
## PACKAGE16SMALL 24ONE CUP          -1.069e+02  1.090e+02  -0.981 0.326806    
## PACKAGE16SMALL MLT SHADYES JUG    -3.776e+01  1.539e+02  -0.245 0.806142    
## PACKAGE16SMALL MULTI CUP          -9.307e+01  8.855e+00 -10.511  < 2e-16 ***
## PACKAGE18SMALL MULTI JUG          -4.791e+01  5.392e+00  -8.886  < 2e-16 ***
## PACKAGE1L MULTI JUG               -3.605e+01  1.345e+01  -2.680 0.007370 ** 
## PACKAGE20SMALL MULTI JUG          -2.529e+01  8.642e+00  -2.926 0.003435 ** 
## PACKAGE24 - 25SMALL MULTI JUG     -5.608e+01  6.753e+00  -8.303  < 2e-16 ***
## PACKAGE24SMALL MLT SHADYES JUG    -3.743e+01  1.996e+01  -1.875 0.060791 .  
## PACKAGE2L MULTI JUG               -7.211e+01  8.673e+00  -8.314  < 2e-16 ***
## PACKAGE3L MULTI JUG               -3.096e+01  5.497e+01  -0.563 0.573240    
## PACKAGE7.5SMALL 6ONE CUP          -3.071e+01  4.344e+01  -0.707 0.479554    
## PACKAGEALL OTHER ONES              2.381e+01  3.090e+01   0.771 0.440973    
## CATEGORYING ENHANCED WATER        -5.774e+01  3.507e+01  -1.646 0.099705 .  
## CATEGORYSPARKLING WATER           -1.305e+02  1.785e+01  -7.311 2.70e-13 ***
## CATEGORYSSD                       -7.457e+01  3.518e+00 -21.200  < 2e-16 ***
## SEASONSPRING                       1.251e+00  2.133e+00   0.586 0.557672    
## SEASONSUMMER                       6.951e+00  2.104e+00   3.303 0.000958 ***
## SEASONWINTER                      -3.287e+00  2.186e+00  -1.504 0.132585    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 153.6 on 42387 degrees of freedom
## Multiple R-squared:  0.8568, Adjusted R-squared:  0.8566 
## F-statistic:  7922 on 32 and 42387 DF,  p-value: < 2.2e-16

Our Casava small has a lower R2 of 0.85, but also contains much more data with nearly 42K observations compared to our innovation df at about 5 observations. There are many signficant features, but nothing that swings the needle in huge ways.

Cleaning

Reworking the subset casava for more feature engineering.

casava_small <- casava_small %>%
  mutate(
    PACKAGE2 = str_extract(ITEM, "(CUP|JUG).*"),  # Extracts the part from CUP or JUG to the end.
    ITEM = str_replace(ITEM, "(CUP|JUG).*", "")  # Replaces the CUP/JUG and everything after it with empty string in ITEM.
  )


#casava_small
casava_small <- casava_small %>%
  mutate(
    TEMP = str_extract(ITEM, "\\d+\\.?\\d*.*"), # Extracts the part from the first number to the end.
    PACKAGE2 = if_else(is.na(PACKAGE2), TEMP, paste(PACKAGE2, TEMP)), # Combines existing PACKAGE2 with new extraction if needed.
    ITEM = str_replace(ITEM, "\\d+\\.?\\d*.*", ""), # Removes the numeric part and everything after it from ITEM.
    TEMP = NULL  # Removes the temporary column.
  )

#casava_small
na_rows <- casava_small %>%
  filter(is.na(PACKAGE2))
#na_rows
#the above steps excised all packaging out of ITEM column
casava_small <- casava_small %>%
  mutate(
    GENTLE_DRINK = if_else(str_detect(ITEM, "GENTLE DRINK"), 1, 0), # Assigns 1 if "GENTLE DRINK" exists, otherwise 0.
    ITEM = str_replace(ITEM, "GENTLE DRINK", "") # Removes "GENTLE DRINK" from ITEM.
  )
#casava_small
casava_small <- casava_small %>%
  mutate(
    ENERGY_DRINK = if_else(str_detect(ITEM, "ENERGY DRINK"), 1, 0), # Assigns 1 if "ENERGY DRINK" exists, otherwise 0.
    ITEM = str_replace(ITEM, "ENERGY DRINK", "") # Removes "ENERGY DRINK" from ITEM.
  )

#casava_small
library(dplyr)
library(stringr)

# Define the pattern as a regular expression
pattern <- "ZERO CALORIES|ZERO CALORIE|ZERO SUGAR|SUGAR FREE|NO CALORIES"

casava_small <- casava_small %>%
  mutate(
    CALORIC_SEGMENT_TEXT = str_extract(ITEM, pattern), # Extracts matching text based on the pattern.
    ITEM = str_replace_all(ITEM, pattern, "") # Removes extracted text from ITEM.
  )

#casava_small
library(dplyr)
library(stringr)

casava_small <- casava_small %>%
  mutate(
    CALORIC_SEGMENT_TEXT = if_else(str_detect(ITEM, "\\bDIET\\b"), 
                                   if_else(is.na(CALORIC_SEGMENT_TEXT), "DIET", paste(CALORIC_SEGMENT_TEXT, "DIET", sep=", ")), 
                                   CALORIC_SEGMENT_TEXT)
  )
#casava_small
# Function to remove the second instance of any repeating word
remove_second_instance <- function(item) {
  words <- unlist(str_split(item, "\\s+")) # Split item into words
  unique_words <- unique(words) # Get unique words to check for repeats
  for (word in unique_words) {
    word_indices <- which(words == word) # Find all indices of the current word
    if (length(word_indices) > 1) { # If there is more than one occurrence
      words[word_indices[2]] <- "" # Remove the second occurrence
    }
  }
  return(paste(words, collapse = " ")) # Reconstruct sentence without the second instance
}

# Apply the function to the 'ITEM' column
casava_small <- casava_small %>%
  mutate(ITEM = sapply(ITEM, remove_second_instance))


# Remove specific columns
casava_small <- select(casava_small, -PACKAGE2, -GENTLE_DRINK, -ENERGY_DRINK, -CALORIC_SEGMENT_TEXT)
head(casava_small)
##          MARKET_KEY       DATE CALORIC_SEGMENT        CATEGORY UNIT_SALES
## 18128439        784 2022-12-24               0 SPARKLING WATER         20
## 13183569         59 2022-07-23               1          ENERGY          1
## 9036653         441 2022-05-21               0 SPARKLING WATER         95
## 21010102        893 2021-05-08               1             SSD          9
## 23033055        965 2021-07-31               1             SSD         32
## 20054930         87 2021-05-15               0 SPARKLING WATER         20
##          DOLLAR_SALES MANUFACTURER                BRAND           PACKAGE
## 18128439        89.53       JOLLYS           BUBBLE JOY  12SMALL 8ONE CUP
## 13183569         2.49       JOLLYS SUPER-DUPER RECOVERY 16SMALL MULTI CUP
## 9036653        345.41       JOLLYS           BUBBLE JOY  12SMALL 8ONE CUP
## 21010102         9.00       JOLLYS        HILL MOISTURE 16SMALL MULTI CUP
## 23033055       107.31        COCOS            FANTASMIC      .5L 6ONE JUG
## 20054930        31.70     SWIRE-CC   GREETINGLE BUBBLES    ALL OTHER ONES
##                                                                              ITEM
## 18128439                                BUBBLE JOY SPARKLING WATER JACK CASAVA   
## 13183569                                        SUPER-DUPER RECOVERY CASAVA JACK 
## 9036653                                 BUBBLE JOY SPARKLING WATER JACK CASAVA   
## 21010102                                               RAINING MAUI BURST CASAVA 
## 23033055                                                        FANTASMIC CASAVA 
## 20054930 GREETINGLE BUBBLES HEALTH BEVERAGE PERU CASAVA NO ARTIFICIAL SWEETENERS 
##          MONTH SEASON      REGION
## 18128439    12 WINTER       NOCAL
## 13183569     7 SUMMER CALI_NEVADA
## 9036653      5 SPRING    MOUNTAIN
## 21010102     5 SPRING     ARIZONA
## 23033055     7 SUMMER    COLORADO
## 20054930     5 SPRING    MOUNTAIN

FINAL THOUGHTS

DIET MOONLIT has pretty decent sales at 69th place in total revenue. Casava is not the sexiest flavor in town, but with our innovation dataframe the R2 is quite high (although it is based on regular and no specific package type). There are some weeks that look great for 6 month predictions, it’s just a matter of deciding which ones to use.

Data Prep and XgBoost Model for best weeks()

df <- read_csv("swire_no_nas_w_pop.csv")  #inject the data and we will sub-sample
## Rows: 24461424 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): CALORIC_SEGMENT, CATEGORY, MANUFACTURER, BRAND, PACKAGE, ITEM
## dbl  (4): MARKET_KEY, UNIT_SALES, DOLLAR_SALES, POP_SQMI
## date (1): DATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# #print unique package where BRAND is DIET MOONLIT
# unique(df$PACKAGE[df$BRAND == "DIET MOONLIT"])
# 
# #print count of CATEGORY = ENERGY with 2L in PACKAGE or ITEM
# table(df$CATEGORY[df$PACKAGE == "2L MULTI JUG" | df$ITEM == "2L MULTI JUG" | df$PACKAGE == "2L MULTI JUG" | df$ITEM == "2L MULTI JUG"])

#sales of BRAND=="MOONLIT"at Max date
df %>% 
  filter(BRAND == "MOONLIT") %>% 
    arrange(desc(DATE)) %>% 
  summarise(UNIT_SALES = sum(UNIT_SALES)) %>% 
  head(1)
## # A tibble: 1 × 1
##   UNIT_SALES
##        <dbl>
## 1  26837843.
df %>% 
  filter(BRAND == "MOONLIT") %>% 
  arrange((DATE)) %>% 
  group_by(DATE) %>%
  summarize(sum(UNIT_SALES)) %>% 
  head(1)
## # A tibble: 1 × 2
##   DATE       `sum(UNIT_SALES)`
##   <date>                 <dbl>
## 1 2020-12-05            172170
#Group by ITEM with DATE Before 2021-01-01, drop those ITEM rows
df_long_running <- df %>%
  group_by(ITEM) %>%
  filter(DATE <= "2021-01-01")

#remove all rows in casava_long_running from casava
df <- df %>%
  anti_join(df_long_running)
## Joining with `by = join_by(MARKET_KEY, DATE, CALORIC_SEGMENT, CATEGORY,
## UNIT_SALES, DOLLAR_SALES, MANUFACTURER, BRAND, PACKAGE, ITEM, POP_SQMI)`
#Group by ITEM rows with less than 20 weeks of data
df_small <- df %>%
  group_by(ITEM) %>%
  filter(n() <= 20)

#remove all rows in casava_long_running from casava
df <- df %>%
  anti_join(df_small)
## Joining with `by = join_by(MARKET_KEY, DATE, CALORIC_SEGMENT, CATEGORY,
## UNIT_SALES, DOLLAR_SALES, MANUFACTURER, BRAND, PACKAGE, ITEM, POP_SQMI)`
#Drop rows after May 21st 2023 as there are several gaps for most brands in innovation casava
df <- df %>%
  filter(DATE <= "2023-05-21")

#cleanup everything but df
rm(df_long_running, df_small)

#skim(df)
# Assuming df is your dataframe
# set.seed(123) # Set a random seed for reproducibility
# sampled_df <- df[sample(1:nrow(df), 2446143), ]
# rm(df)
# df <- sampled_df
# rm(sampled_df)
#skim(df)
#summary(df)
#casava CASAVA and MOONLIT (regular)
casava <- df %>%
#  filter(PACKAGE == "2L MULTI JUG",
         filter(
         str_detect(ITEM, "CASAVA"),
         BRAND == "MOONLIT")

#skim(casava)
#DIET MOONLIT where package is either 2L MULTI JUG or 12SMALL MULTI CUP or 16SMALL 24ONE CUP
diet_moonlit <- df %>%
  filter(PACKAGE %in% c("2L MULTI JUG", "12SMALL", "16SMALL") &
           BRAND == "DIET MOONLIT")



#skim(diet_moonlit)
#CASAVA and ENERGY
energy <- df %>% 
  filter(CATEGORY == "ENERGY",
         str_detect(ITEM, "CASAVA")
  )
# #Diet Energy - too much noise
diet_energy <- df %>% 
    filter(CATEGORY == "ENERGY",
            CALORIC_SEGMENT == 0)
# Merge the data frames
merged_innovation_df <- bind_rows(casava, diet_moonlit, energy, diet_energy)
#merged_innovation_df <- bind_rows(casava, diet_moonlit, energy)
#remove duplicate rows
df <- merged_innovation_df %>% distinct()

skim(df)
Data summary
Name df
Number of rows 72504
Number of columns 11
_______________________
Column type frequency:
character 6
Date 1
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
CALORIC_SEGMENT 0 1 7 10 0 2 0
CATEGORY 0 1 3 6 0 2 0
MANUFACTURER 0 1 5 8 0 3 0
BRAND 0 1 7 25 0 7 0
PACKAGE 0 1 12 17 0 5 0
ITEM 0 1 48 87 0 9 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
DATE 0 1 2021-01-02 2023-05-20 2022-07-02 125

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
MARKET_KEY 0 1 581.92 587.35 1.00 260.00 536.00 831.00 6802.00 ▇▁▁▁▁
UNIT_SALES 0 1 90.60 239.33 1.00 17.00 46.00 99.00 5349.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 179.87 462.39 0.25 30.64 82.84 186.83 11270.01 ▇▁▁▁▁
POP_SQMI 0 1 1383.59 1772.35 0.18 35.12 349.46 2474.41 6769.35 ▇▂▂▁▁
regions_joinme <- read.csv("states_summary.csv")

unique(regions_joinme$REGION)
##  [1] "NORTHERN"    "DESERT_SW"   "PRAIRIE"     "CALI_NEVADA" "MOUNTAIN"   
##  [6] "SOCAL"       "ARIZONA"     "NEWMEXICO"   "NOCAL"       "COLORADO"   
## [11] "KANSAS"
# "NORTHERN"    "DESERT_SW"   "PRAIRIE"     "CALI_NEVADA"  "MOUNTAIN"    "SOCAL"   "ARIZONA"    "NEWMEXICO"   "NOCAL"    "COLORADO"    "KANSAS" 

str(regions_joinme)
## 'data.frame':    200 obs. of  2 variables:
##  $ MARKET_KEY: int  13 70 179 197 272 352 32 33 44 50 ...
##  $ REGION    : chr  "NORTHERN" "NORTHERN" "DESERT_SW" "DESERT_SW" ...
# Perform a left join using the merge() function
df <- merge(df, regions_joinme[, c("MARKET_KEY", "REGION")], by = "MARKET_KEY", all.x = TRUE)
rm(regions_joinme)
# Update CALORIC_SEGMENT values: 0 if 'DIET/LIGHT', otherwise 1
df$CALORIC_SEGMENT <- ifelse(df$CALORIC_SEGMENT == "DIET/LIGHT", 0, 1)
df$MARKET_KEY <- as.character(df$MARKET_KEY)
df <- df %>%
  mutate(
    MONTH = as.numeric(substr(DATE, 6, 7)),  # Extract the month from YYYY-MM-DD format
    SEASON = case_when(
      MONTH %in% c(12, 01, 02) ~ "WINTER",
      MONTH %in% c(03, 04, 05) ~ "SPRING",
      MONTH %in% c(06, 07, 08) ~ "SUMMER",
      MONTH %in% c(09, 10, 11) ~ "FALL",
      TRUE ~ NA_character_  # This is just in case there are any undefined values
    )
  )

Cleaning Casava

#save merged_innovation_df back to casava
casava <- df

skim(casava)
Data summary
Name casava
Number of rows 72504
Number of columns 14
_______________________
Column type frequency:
character 8
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MARKET_KEY 0 1 1 4 0 200 0
CATEGORY 0 1 3 6 0 2 0
MANUFACTURER 0 1 5 8 0 3 0
BRAND 0 1 7 25 0 7 0
PACKAGE 0 1 12 17 0 5 0
ITEM 0 1 48 87 0 9 0
REGION 0 1 5 11 0 11 0
SEASON 0 1 4 6 0 4 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
DATE 0 1 2021-01-02 2023-05-20 2022-07-02 125

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.67 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
UNIT_SALES 0 1 90.60 239.33 1.00 17.00 46.00 99.00 5349.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 179.87 462.39 0.25 30.64 82.84 186.83 11270.01 ▇▁▁▁▁
POP_SQMI 0 1 1383.59 1772.35 0.18 35.12 349.46 2474.41 6769.35 ▇▂▂▁▁
MONTH 0 1 6.37 3.53 1.00 3.00 6.00 10.00 12.00 ▇▆▅▅▇

Reworking the subset Casava for more feature engineering.

casava <- casava %>%
  mutate(
    PACKAGE2 = str_extract(ITEM, "(CUP|JUG).*"),  # Extracts the part from CUP or JUG to the end.
    ITEM = str_replace(ITEM, "(CUP|JUG).*", "")  # Replaces the CUP/JUG and everything after it with empty string in ITEM.
  )
casava <- casava %>%
  mutate(
    TEMP = str_extract(ITEM, "\\d+\\.?\\d*.*"), # Extracts the part from the first number to the end.
    PACKAGE2 = if_else(is.na(PACKAGE2), TEMP, paste(PACKAGE2, TEMP)), # Combines existing PACKAGE2 with new extraction if needed.
    ITEM = str_replace(ITEM, "\\d+\\.?\\d*.*", ""), # Removes the numeric part and everything after it from ITEM.
    TEMP = NULL  # Removes the temporary column.
  )
na_rows <- casava %>%
  filter(is.na(PACKAGE2))
na_rows
##  [1] MARKET_KEY      DATE            CALORIC_SEGMENT CATEGORY       
##  [5] UNIT_SALES      DOLLAR_SALES    MANUFACTURER    BRAND          
##  [9] PACKAGE         ITEM            POP_SQMI        REGION         
## [13] MONTH           SEASON          PACKAGE2       
## <0 rows> (or 0-length row.names)
#the above steps excised all packaging out of ITEM column
# Function to remove the second instance of any repeating word
remove_second_instance <- function(item) {
  words <- unlist(str_split(item, "\\s+")) # Split item into words
  unique_words <- unique(words) # Get unique words to check for repeats
  for (word in unique_words) {
    word_indices <- which(words == word) # Find all indices of the current word
    if (length(word_indices) > 1) { # If there is more than one occurrence
      words[word_indices[2]] <- "" # Remove the second occurrence
    }
  }
  return(paste(words, collapse = " ")) # Reconstruct sentence without the second instance
}

# Apply the function to the 'ITEM' column
casava <- casava %>%
  mutate(ITEM = sapply(ITEM, remove_second_instance))
# #One hot encode either "ENERGY" or "ED" in ITEM as an ENERGY_DRINK
# casava$ENERGY_DRINK <- ifelse(str_detect(casava$ITEM, "ENERGY|' ED'"), 1, 0)
# 
# casava$ITEM <- str_replace(casava$ITEM, "ENERGY DRINK", "")
# casava$ITEM <- str_replace(casava$ITEM, "ENERGY", "")
# casava$ITEM <- str_replace(casava$ITEM, " ED", "")
# table(casava$ENERGY_DRINK)
# 
# table(casava$CATEGORY)
# 
# casava %>% 
#   filter(ENERGY_DRINK == 1,
#          CATEGORY=='SSD') %>% 
#   select(ITEM) %>% 
#   head(10)
# Remove specific columns
#casava <- select(casava, -PACKAGE2, -CATEGORY)
head(casava)
##   MARKET_KEY       DATE CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
## 1          1 2022-09-17               1   ENERGY         21        38.51
## 2          1 2021-09-18               1   ENERGY         27        45.03
## 3          1 2022-11-05               1   ENERGY         33        91.62
## 4          1 2023-04-29               1   ENERGY         54        96.99
## 5          1 2023-04-01               1   ENERGY         23        56.79
## 6          1 2023-04-29               1   ENERGY         24        53.46
##   MANUFACTURER                BRAND           PACKAGE
## 1       JOLLYS   SUPER-DUPER JUICED 16SMALL MULTI CUP
## 2       JOLLYS   SUPER-DUPER JUICED 16SMALL MULTI CUP
## 3        PONYS    MYTHICAL BEVERAGE 16SMALL MULTI CUP
## 4       JOLLYS SUPER-DUPER RECOVERY 16SMALL MULTI CUP
## 5        PONYS    MYTHICAL BEVERAGE 16SMALL MULTI CUP
## 6        PONYS    MYTHICAL BEVERAGE 16SMALL MULTI CUP
##                                                   ITEM POP_SQMI   REGION MONTH
## 1 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA  1.201114 NORTHERN     9
## 2 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA  1.201114 NORTHERN     9
## 3 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA  1.201114 NORTHERN    11
## 4       SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK  1.201114 NORTHERN     4
## 5 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA  1.201114 NORTHERN     4
## 6 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA  1.201114 NORTHERN     4
##   SEASON               PACKAGE2
## 1   FALL CUP 16 LIQUID SMALL NA
## 2   FALL CUP 16 LIQUID SMALL NA
## 3   FALL CUP 16 LIQUID SMALL NA
## 4 SPRING CUP 16 LIQUID SMALL NA
## 5 SPRING CUP 16 LIQUID SMALL NA
## 6 SPRING CUP 16 LIQUID SMALL NA
table(casava$ITEM)
## 
##                               JUMPIN-FISH ENERGY DRINK CASAVA JACK  
##                                                                   8 
##                                        MOONLIT GENTLE DRINK CASAVA  
##                                                                1540 
##                                        MOONLIT GENTLE DRINK SUNSET  
##                                                               23881 
##                MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA  
##                                                               17089 
##                SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA  
##                                                               16930 
## SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS  
##                                                                4774 
##                      SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK  
##                                                                8282
#write.csv(casava_small, "casava_yellow.csv", row.names = FALSE)
#write.csv(diet_moonlit_df, "diet_moonlit.csv", row.names = FALSE)
#Trim trailing white space at end of ITEM
casava$ITEM <- str_trim(casava$ITEM, side = "right")
# #replace "GENTLE DRINK" with "" in ITEM
# casava$ITEM <- str_replace(casava$ITEM, "GENTLE DRINK", "")
#One hot encode "NO ARTIFICAL SWEETNERS" in ITEM
# casava$NO_ARTIFICIAL_SWEETNERS <- ifelse(str_detect(casava$ITEM, 
#                                                     "NO ARTIFICIAL SWEETENERS"), 
#                                          1, 0)
# 
# table(casava$NO_ARTIFICIAL_SWEETNERS)
# # #Remove "NO ARTIFICIAL SWEETNERS" from ITEM
# casava$ITEM <- str_replace(casava$ITEM, "NO ARTIFICIAL SWEETENERS", "")
#Remove Health Supplement rows
# casava <- casava %>%
#   filter(!str_detect(ITEM, "HEALTH SUPPLEMENT"))
# pattern <- "ZERO CALORIES|ZERO CALORIE|ZERO SUGAR|SUGAR FREE|NO CALORIES|ZERO CARB|PURE ZERO|DIET"
# 
# casava <- casava %>%
#   mutate(
#     CALORIC_SEGMENT_TEXT = str_extract(ITEM, pattern), # Extracts matching text based on the pattern.
#     ITEM = str_replace_all(ITEM, pattern, "") # Removes extracted text from ITEM.
#   )
#remove mythical beverage - monster reserve casava (pineapple)
#casava <- casava %>%
#  filter(!str_detect(ITEM, "MYTHICAL BEVERAGE"))

# Remove JUMPING JACK - sporadic single week
casava <- casava %>%
 filter(!str_detect(ITEM, "JUMPIN-FISH  CASAVA JACK"))

#Remove "SUPER-DUPER CASAVA YELLOW"
# casava <- casava %>%
#   filter(!str_detect(ITEM, "SUPER-DUPER PURE CASAVA YELLOW"))


#drop row with MOONLIT SUNSET in ITEM
casava <- casava %>%
  filter(!str_detect(ITEM, "MOONLIT  SUNSET"))
print(unique(casava$ITEM))
## [1] "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA"               
## [2] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA"               
## [3] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK"                     
## [4] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS"
## [5] "MOONLIT GENTLE DRINK SUNSET"                                       
## [6] "MOONLIT GENTLE DRINK CASAVA"                                       
## [7] "JUMPIN-FISH ENERGY DRINK CASAVA JACK"
# Find the minimum launch date for each product
min_launch_dates <- casava %>%
  group_by(ITEM) %>%
  summarise(min_launch_date = min(DATE))

# Join the minimum launch dates back to the original data
casava <- casava %>%
  left_join(min_launch_dates, by = "ITEM")

# Calculate the number of weeks since the product launch
casava <- casava %>%
  mutate(WEEKS_SINCE_LAUNCH = as.numeric(difftime(DATE, min_launch_date, units = "weeks")))

# Selecting required columns and printing the first 10 rows
casava %>%
  filter(UNIT_SALES > 0) %>%
  select(DATE, ITEM, WEEKS_SINCE_LAUNCH) %>%
  head(10)
##          DATE                                                ITEM
## 1  2022-09-17 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 2  2021-09-18 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 3  2022-11-05 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 4  2023-04-29       SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK
## 5  2023-04-01 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 6  2023-04-29 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 7  2022-05-14 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 8  2022-05-07 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 9  2022-09-03 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 10 2022-10-29 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
##    WEEKS_SINCE_LAUNCH
## 1                  89
## 2                  37
## 3                  60
## 4                  45
## 5                  81
## 6                  85
## 7                  35
## 8                  70
## 9                  51
## 10                 95
#Subtract before April 19th from WEEKS_SINCE_LAUNCH where ITEM === "SUPER-DUPER JUICED  CASAVA SUNSET GUAVA" actually launches and takes off, replace anything less than 0 with 0.

#Remove rows from ITEM == "SUPER-DUPER JUICED  CASAVA SUNSET GUAVA" before April 19th 2021 when something about it actually launches
casava <- casava %>%
  filter(!(ITEM == "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" & DATE < as.Date("2021-04-19")))

#Subtract WEEKS SINCE LAUNCH for ITEM == "SUPER-DUPER JUICED  CASAVA SUNSET GUAVA" by 16 weeks, so real launch date matches up.
casava <- casava %>%
  mutate(WEEKS_SINCE_LAUNCH = ifelse(ITEM == "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA", WEEKS_SINCE_LAUNCH - 16, WEEKS_SINCE_LAUNCH))

#Set any negative WEEKS_SINCE_LAUNCH to 0
casava <- casava %>%
  mutate(WEEKS_SINCE_LAUNCH = ifelse(WEEKS_SINCE_LAUNCH < 0, 0, WEEKS_SINCE_LAUNCH))

#min date of ITEM "SUPER-DUPER JUICED  CASAVA SUNSET GUAVA"
casava %>% 
  filter(ITEM == "SUPER-DUPER JUICED  CASAVA SUNSET GUAVA") %>%
  summarise(min(DATE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `min(DATE)`.
## Caused by warning in `min.default()`:
## ! no non-missing arguments to min; returning Inf
##   min(DATE)
## 1       Inf
print(unique(casava$ITEM))
## [1] "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA"               
## [2] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA"               
## [3] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK"                     
## [4] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS"
## [5] "MOONLIT GENTLE DRINK SUNSET"                                       
## [6] "MOONLIT GENTLE DRINK CASAVA"                                       
## [7] "JUMPIN-FISH ENERGY DRINK CASAVA JACK"
print(unique(casava$BRAND))
## [1] "SUPER-DUPER JUICED"        "MYTHICAL BEVERAGE"        
## [3] "SUPER-DUPER RECOVERY"      "SUPER-DUPER PUNCHED"      
## [5] "DIET MOONLIT"              "MOONLIT"                  
## [7] "HILL MOISTURE JUMPIN-FISH"
print(unique(casava$CATEGORY))
## [1] "ENERGY" "SSD"
print(unique(casava$PACKAGE))
## [1] "16SMALL MULTI CUP" "2L MULTI JUG"      "20SMALL MULTI JUG"
## [4] "16SMALL 24ONE CUP" "12SMALL MULTI CUP"
#print(unique(casava$CALORIC_SEGMENT_TEXT))
print(unique(casava$CALORIC_SEGMENT))
## [1] 1 0
#What percent of UNIT SALE are "2L MULTI JUG"
casava %>%
  filter(PACKAGE == "2L MULTI JUG") %>%
  summarise(UNIT_SALES = sum(UNIT_SALES)) %>%
  mutate(PERCENTAGE = UNIT_SALES / sum(casava$UNIT_SALES) * 100)
##   UNIT_SALES PERCENTAGE
## 1    2174965   33.10922
#What percent of UNIT SALE are "DIET"
casava %>%
  filter(CALORIC_SEGMENT == "DIET") %>%
  summarise(UNIT_SALES = sum(UNIT_SALES)) %>%
  mutate(PERCENTAGE = UNIT_SALES / sum(casava$UNIT_SALES) * 100)
##   UNIT_SALES PERCENTAGE
## 1          0          0
#Test removing ITEMS containing MOONLIT CASAVA and MYSTICAL BEVERAGE
# unique(casava$BRAND)
# casava <- casava %>%
#   filter(!BRAND=="MOONLIT",
#          !BRAND=="MYTHICAL BEVERAGE")
# unique(casava$BRAND)
# 
 write_csv(casava, "casava_tableau.csv")

str(casava)
## 'data.frame':    72455 obs. of  17 variables:
##  $ MARKET_KEY        : chr  "1" "1" "1" "1" ...
##  $ DATE              : Date, format: "2022-09-17" "2021-09-18" ...
##  $ CALORIC_SEGMENT   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ CATEGORY          : chr  "ENERGY" "ENERGY" "ENERGY" "ENERGY" ...
##  $ UNIT_SALES        : num  21 27 33 54 23 24 40 46 44 20 ...
##  $ DOLLAR_SALES      : num  38.5 45 91.6 97 56.8 ...
##  $ MANUFACTURER      : chr  "JOLLYS" "JOLLYS" "PONYS" "JOLLYS" ...
##  $ BRAND             : chr  "SUPER-DUPER JUICED" "SUPER-DUPER JUICED" "MYTHICAL BEVERAGE" "SUPER-DUPER RECOVERY" ...
##  $ PACKAGE           : chr  "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" ...
##  $ ITEM              : chr  "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA" "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK" ...
##  $ POP_SQMI          : num  1.2 1.2 1.2 1.2 1.2 ...
##  $ REGION            : chr  "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
##  $ MONTH             : num  9 9 11 4 4 4 5 5 9 10 ...
##  $ SEASON            : chr  "FALL" "FALL" "FALL" "SPRING" ...
##  $ PACKAGE2          : chr  "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" ...
##  $ min_launch_date   : Date, format: "2021-01-02" "2021-01-02" ...
##  $ WEEKS_SINCE_LAUNCH: num  73 21 60 45 81 85 35 54 51 79 ...
#remove all objects other than casava
rm(list = setdiff(ls(), "casava"))

print(unique(casava$ITEM))
## [1] "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA"               
## [2] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA"               
## [3] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK"                     
## [4] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS"
## [5] "MOONLIT GENTLE DRINK SUNSET"                                       
## [6] "MOONLIT GENTLE DRINK CASAVA"                                       
## [7] "JUMPIN-FISH ENERGY DRINK CASAVA JACK"
print(unique(casava$BRAND))
## [1] "SUPER-DUPER JUICED"        "MYTHICAL BEVERAGE"        
## [3] "SUPER-DUPER RECOVERY"      "SUPER-DUPER PUNCHED"      
## [5] "DIET MOONLIT"              "MOONLIT"                  
## [7] "HILL MOISTURE JUMPIN-FISH"
print(unique(casava$CATEGORY))
## [1] "ENERGY" "SSD"
print(unique(casava$PACKAGE))
## [1] "16SMALL MULTI CUP" "2L MULTI JUG"      "20SMALL MULTI JUG"
## [4] "16SMALL 24ONE CUP" "12SMALL MULTI CUP"
#what caloric segment is ITEM "MOONLIT SUNSET"
casava %>% 
  filter(ITEM == "MOONLIT SUNSET")
##  [1] MARKET_KEY         DATE               CALORIC_SEGMENT    CATEGORY          
##  [5] UNIT_SALES         DOLLAR_SALES       MANUFACTURER       BRAND             
##  [9] PACKAGE            ITEM               POP_SQMI           REGION            
## [13] MONTH              SEASON             PACKAGE2           min_launch_date   
## [17] WEEKS_SINCE_LAUNCH
## <0 rows> (or 0-length row.names)
# Creating an 'innovation' data frame

#factor all character variables
casava$REGION <- as.factor(casava$REGION)
#casava$CATEGORY <- as.factor(casava$CATEGORY)
#casava$BRAND <- as.factor(casava$BRAND)
casava$SEASON <- as.factor(casava$SEASON)
casava$PACKAGE2 <- as.factor(casava$PACKAGE2)



#model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + POP_SQMI + REGION + CATEGORY  + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# model <- lm(DOLLAR_SALES ~ UNIT_SALES + POP_SQMI + REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# summary(model)
# Creating an 'innovation' data frame

#model <- lm(UNIT_SALES ~ DOLLAR_SALES + CALORIC_SEGMENT + PACKAGE + POP_SQMI + REGION + CATEGORY  + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# model <- lm(UNIT_SALES ~ DOLLAR_SALES + PACKAGE + POP_SQMI + REGION  + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# summary(model)
# Creating an 'innovation' data frame

# #model <- lm(UNIT_SALES ~  CALORIC_SEGMENT + PACKAGE + POP_SQMI + REGION + CATEGORY  + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# model <- lm(UNIT_SALES ~   + PACKAGE + POP_SQMI + REGION  + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# summary(model)
# Load and prepare dataset
df <- read.csv("casava_tableau.csv") 
# Load and prepare dataset 

str(df)
## 'data.frame':    72455 obs. of  17 variables:
##  $ MARKET_KEY        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DATE              : chr  "2022-09-17" "2021-09-18" "2022-11-05" "2023-04-29" ...
##  $ CALORIC_SEGMENT   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ CATEGORY          : chr  "ENERGY" "ENERGY" "ENERGY" "ENERGY" ...
##  $ UNIT_SALES        : int  21 27 33 54 23 24 40 46 44 20 ...
##  $ DOLLAR_SALES      : num  38.5 45 91.6 97 56.8 ...
##  $ MANUFACTURER      : chr  "JOLLYS" "JOLLYS" "PONYS" "JOLLYS" ...
##  $ BRAND             : chr  "SUPER-DUPER JUICED" "SUPER-DUPER JUICED" "MYTHICAL BEVERAGE" "SUPER-DUPER RECOVERY" ...
##  $ PACKAGE           : chr  "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" ...
##  $ ITEM              : chr  "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA" "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK" ...
##  $ POP_SQMI          : num  1.2 1.2 1.2 1.2 1.2 ...
##  $ REGION            : chr  "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
##  $ MONTH             : int  9 9 11 4 4 4 5 5 9 10 ...
##  $ SEASON            : chr  "FALL" "FALL" "FALL" "SPRING" ...
##  $ PACKAGE2          : chr  "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" ...
##  $ min_launch_date   : chr  "2021-01-02" "2021-01-02" "2021-09-11" "2022-06-18" ...
##  $ WEEKS_SINCE_LAUNCH: int  73 21 60 45 81 85 35 54 51 79 ...
casava <- df %>% 
  #select(-DATE, -MONTH, -SEASON, -BRAND, -REGION, -ITEM )
  #select(-MONTH, -SEASON, -min_launch_date, -PACKAGE2, -CALORIC_SEGMENT_TEXT)
  select(-MONTH, -SEASON, -min_launch_date, -PACKAGE2)
# Assuming casava is your data frame and PACKAGING is the column of interest

# Create new columns in woodsy for each unique substring
# Each column will have a 1 if the substring is found in the PACKAGING column, 0 otherwise

casava$`16SMALL MULTI CUP` = as.integer(grepl("16SMALL MULTI CUP", casava$PACKAGE))
casava$`20SMALL MULTI JUG` = as.integer(grepl("20SMALL MULTI JUG", casava$PACKAGE))
casava$`16SMALL 24ONE CUP` = as.integer(grepl("16SMALL 24ONE CUP", casava$PACKAGE))
casava$`2L MULTI JUG` = as.integer(grepl("2L MULTI JUG", casava$PACKAGE))

     

#one hot encode non brand ITEM strings
casava$SUNSET = as.integer(grepl("SUNSET", casava$ITEM))
casava$BLAST = as.integer(grepl("BLAST", casava$ITEM))
casava$JUICED = as.integer(grepl("JUICED", casava$ITEM))
casava$GUAVA = as.integer(grepl("GUAVA", casava$ITEM))
casava$RECOVERY = as.integer(grepl("RECOVERY", casava$ITEM))
casava$JACK = as.integer(grepl("JACK", casava$ITEM))
casava$RESERVE= as.integer(grepl("RESERVE", casava$ITEM))
casava$WHITE= as.integer(grepl("WHITE", casava$ITEM))
casava$PITAYA= as.integer(grepl("PITAYA", casava$ITEM))
casava$ED= as.integer(grepl("ED", casava$ITEM))
casava$CASAVA= as.integer(grepl("CASAVA", casava$ITEM)) 

# Print the head of the data frame to see the first few rows
head(casava)
##   MARKET_KEY       DATE CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
## 1          1 2022-09-17               1   ENERGY         21        38.51
## 2          1 2021-09-18               1   ENERGY         27        45.03
## 3          1 2022-11-05               1   ENERGY         33        91.62
## 4          1 2023-04-29               1   ENERGY         54        96.99
## 5          1 2023-04-01               1   ENERGY         23        56.79
## 6          1 2023-04-29               1   ENERGY         24        53.46
##   MANUFACTURER                BRAND           PACKAGE
## 1       JOLLYS   SUPER-DUPER JUICED 16SMALL MULTI CUP
## 2       JOLLYS   SUPER-DUPER JUICED 16SMALL MULTI CUP
## 3        PONYS    MYTHICAL BEVERAGE 16SMALL MULTI CUP
## 4       JOLLYS SUPER-DUPER RECOVERY 16SMALL MULTI CUP
## 5        PONYS    MYTHICAL BEVERAGE 16SMALL MULTI CUP
## 6        PONYS    MYTHICAL BEVERAGE 16SMALL MULTI CUP
##                                                  ITEM POP_SQMI   REGION
## 1 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA 1.201114 NORTHERN
## 2 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA 1.201114 NORTHERN
## 3 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 1.201114 NORTHERN
## 4       SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK 1.201114 NORTHERN
## 5 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 1.201114 NORTHERN
## 6 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 1.201114 NORTHERN
##   WEEKS_SINCE_LAUNCH 16SMALL MULTI CUP 20SMALL MULTI JUG 16SMALL 24ONE CUP
## 1                 73                 1                 0                 0
## 2                 21                 1                 0                 0
## 3                 60                 1                 0                 0
## 4                 45                 1                 0                 0
## 5                 81                 1                 0                 0
## 6                 85                 1                 0                 0
##   2L MULTI JUG SUNSET BLAST JUICED GUAVA RECOVERY JACK RESERVE WHITE PITAYA ED
## 1            0      1     0      1     1        0    0       0     0      0  1
## 2            0      1     0      1     1        0    0       0     0      0  1
## 3            0      0     0      0     0        0    0       1     1      0  0
## 4            0      0     0      0     0        1    1       0     0      0  0
## 5            0      0     0      0     0        0    0       1     1      0  0
## 6            0      0     0      0     0        0    0       1     1      0  0
##   CASAVA
## 1      1
## 2      1
## 3      1
## 4      1
## 5      1
## 6      1
casava$CATEGORY <- NULL
casava$MARKET_KEY <- NULL
casava$MANUFACTURER <- NULL
casava$PACKAGE <- NULL
library(fastDummies)

# One-hot encode the specified columns
casava <- fastDummies::dummy_cols(casava, select_columns = c("REGION", "ITEM"), remove_selected_columns = TRUE)
#casava <- fastDummies::dummy_cols(casava, select_columns = c("REGION", "SEASON","ITEM"), remove_selected_columns = TRUE)

# View the first few rows to verify the changes
head(casava)
##         DATE CALORIC_SEGMENT UNIT_SALES DOLLAR_SALES                BRAND
## 1 2022-09-17               1         21        38.51   SUPER-DUPER JUICED
## 2 2021-09-18               1         27        45.03   SUPER-DUPER JUICED
## 3 2022-11-05               1         33        91.62    MYTHICAL BEVERAGE
## 4 2023-04-29               1         54        96.99 SUPER-DUPER RECOVERY
## 5 2023-04-01               1         23        56.79    MYTHICAL BEVERAGE
## 6 2023-04-29               1         24        53.46    MYTHICAL BEVERAGE
##   POP_SQMI WEEKS_SINCE_LAUNCH 16SMALL MULTI CUP 20SMALL MULTI JUG
## 1 1.201114                 73                 1                 0
## 2 1.201114                 21                 1                 0
## 3 1.201114                 60                 1                 0
## 4 1.201114                 45                 1                 0
## 5 1.201114                 81                 1                 0
## 6 1.201114                 85                 1                 0
##   16SMALL 24ONE CUP 2L MULTI JUG SUNSET BLAST JUICED GUAVA RECOVERY JACK
## 1                 0            0      1     0      1     1        0    0
## 2                 0            0      1     0      1     1        0    0
## 3                 0            0      0     0      0     0        0    0
## 4                 0            0      0     0      0     0        1    1
## 5                 0            0      0     0      0     0        0    0
## 6                 0            0      0     0      0     0        0    0
##   RESERVE WHITE PITAYA ED CASAVA REGION_ARIZONA REGION_CALI_NEVADA
## 1       0     0      0  1      1              0                  0
## 2       0     0      0  1      1              0                  0
## 3       1     1      0  0      1              0                  0
## 4       0     0      0  0      1              0                  0
## 5       1     1      0  0      1              0                  0
## 6       1     1      0  0      1              0                  0
##   REGION_COLORADO REGION_DESERT_SW REGION_KANSAS REGION_MOUNTAIN
## 1               0                0             0               0
## 2               0                0             0               0
## 3               0                0             0               0
## 4               0                0             0               0
## 5               0                0             0               0
## 6               0                0             0               0
##   REGION_NEWMEXICO REGION_NOCAL REGION_NORTHERN REGION_PRAIRIE REGION_SOCAL
## 1                0            0               1              0            0
## 2                0            0               1              0            0
## 3                0            0               1              0            0
## 4                0            0               1              0            0
## 5                0            0               1              0            0
## 6                0            0               1              0            0
##   ITEM_JUMPIN-FISH ENERGY DRINK CASAVA JACK ITEM_MOONLIT GENTLE DRINK CASAVA
## 1                                         0                                0
## 2                                         0                                0
## 3                                         0                                0
## 4                                         0                                0
## 5                                         0                                0
## 6                                         0                                0
##   ITEM_MOONLIT GENTLE DRINK SUNSET
## 1                                0
## 2                                0
## 3                                0
## 4                                0
## 5                                0
## 6                                0
##   ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 1                                                        0
## 2                                                        0
## 3                                                        1
## 4                                                        0
## 5                                                        1
## 6                                                        1
##   ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 1                                                        1
## 2                                                        1
## 3                                                        0
## 4                                                        0
## 5                                                        0
## 6                                                        0
##   ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS
## 1                                                                       0
## 2                                                                       0
## 3                                                                       0
## 4                                                                       0
## 5                                                                       0
## 6                                                                       0
##   ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK
## 1                                                  0
## 2                                                  0
## 3                                                  0
## 4                                                  1
## 5                                                  0
## 6                                                  0
write.csv(casava, "casava_one_hot.csv", row.names = FALSE)
library(fastDummies)

# One-hot encode
casava <- fastDummies::dummy_cols(casava, select_columns = "BRAND", remove_selected_columns = TRUE)

# View the first few rows to verify
head(casava)
##         DATE CALORIC_SEGMENT UNIT_SALES DOLLAR_SALES POP_SQMI
## 1 2022-09-17               1         21        38.51 1.201114
## 2 2021-09-18               1         27        45.03 1.201114
## 3 2022-11-05               1         33        91.62 1.201114
## 4 2023-04-29               1         54        96.99 1.201114
## 5 2023-04-01               1         23        56.79 1.201114
## 6 2023-04-29               1         24        53.46 1.201114
##   WEEKS_SINCE_LAUNCH 16SMALL MULTI CUP 20SMALL MULTI JUG 16SMALL 24ONE CUP
## 1                 73                 1                 0                 0
## 2                 21                 1                 0                 0
## 3                 60                 1                 0                 0
## 4                 45                 1                 0                 0
## 5                 81                 1                 0                 0
## 6                 85                 1                 0                 0
##   2L MULTI JUG SUNSET BLAST JUICED GUAVA RECOVERY JACK RESERVE WHITE PITAYA ED
## 1            0      1     0      1     1        0    0       0     0      0  1
## 2            0      1     0      1     1        0    0       0     0      0  1
## 3            0      0     0      0     0        0    0       1     1      0  0
## 4            0      0     0      0     0        1    1       0     0      0  0
## 5            0      0     0      0     0        0    0       1     1      0  0
## 6            0      0     0      0     0        0    0       1     1      0  0
##   CASAVA REGION_ARIZONA REGION_CALI_NEVADA REGION_COLORADO REGION_DESERT_SW
## 1      1              0                  0               0                0
## 2      1              0                  0               0                0
## 3      1              0                  0               0                0
## 4      1              0                  0               0                0
## 5      1              0                  0               0                0
## 6      1              0                  0               0                0
##   REGION_KANSAS REGION_MOUNTAIN REGION_NEWMEXICO REGION_NOCAL REGION_NORTHERN
## 1             0               0                0            0               1
## 2             0               0                0            0               1
## 3             0               0                0            0               1
## 4             0               0                0            0               1
## 5             0               0                0            0               1
## 6             0               0                0            0               1
##   REGION_PRAIRIE REGION_SOCAL ITEM_JUMPIN-FISH ENERGY DRINK CASAVA JACK
## 1              0            0                                         0
## 2              0            0                                         0
## 3              0            0                                         0
## 4              0            0                                         0
## 5              0            0                                         0
## 6              0            0                                         0
##   ITEM_MOONLIT GENTLE DRINK CASAVA ITEM_MOONLIT GENTLE DRINK SUNSET
## 1                                0                                0
## 2                                0                                0
## 3                                0                                0
## 4                                0                                0
## 5                                0                                0
## 6                                0                                0
##   ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 1                                                        0
## 2                                                        0
## 3                                                        1
## 4                                                        0
## 5                                                        1
## 6                                                        1
##   ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 1                                                        1
## 2                                                        1
## 3                                                        0
## 4                                                        0
## 5                                                        0
## 6                                                        0
##   ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS
## 1                                                                       0
## 2                                                                       0
## 3                                                                       0
## 4                                                                       0
## 5                                                                       0
## 6                                                                       0
##   ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK BRAND_DIET MOONLIT
## 1                                                  0                  0
## 2                                                  0                  0
## 3                                                  0                  0
## 4                                                  1                  0
## 5                                                  0                  0
## 6                                                  0                  0
##   BRAND_HILL MOISTURE JUMPIN-FISH BRAND_MOONLIT BRAND_MYTHICAL BEVERAGE
## 1                               0             0                       0
## 2                               0             0                       0
## 3                               0             0                       1
## 4                               0             0                       0
## 5                               0             0                       1
## 6                               0             0                       1
##   BRAND_SUPER-DUPER JUICED BRAND_SUPER-DUPER PUNCHED BRAND_SUPER-DUPER RECOVERY
## 1                        1                         0                          0
## 2                        1                         0                          0
## 3                        0                         0                          0
## 4                        0                         0                          1
## 5                        0                         0                          0
## 6                        0                         0                          0
#create new week of year column

casava <- casava %>%
  mutate(DATE = as.Date(DATE)) %>%
  mutate(WEEK_OF_YEAR = lubridate::week(DATE))

#Drop DATE column
casava$DATE <- NULL
# Summarize the dataset
skimr::skim(casava)
Data summary
Name casava
Number of rows 72455
Number of columns 46
_______________________
Column type frequency:
numeric 46
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CALORIC_SEGMENT 0 1 0.67 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
UNIT_SALES 0 1 90.66 239.40 1.00 17.00 46.00 99.00 5349.00 ▇▁▁▁▁
DOLLAR_SALES 0 1 179.99 462.52 0.25 30.70 82.93 187.00 11270.01 ▇▁▁▁▁
POP_SQMI 0 1 1384.01 1772.61 0.18 35.12 349.46 2474.41 6769.35 ▇▂▂▁▁
WEEKS_SINCE_LAUNCH 0 1 48.25 31.55 0.00 22.00 43.00 72.00 124.00 ▇▇▆▅▂
16SMALL MULTI CUP 0 1 0.65 0.48 0.00 0.00 1.00 1.00 1.00 ▅▁▁▁▇
20SMALL MULTI JUG 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
16SMALL 24ONE CUP 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
2L MULTI JUG 0 1 0.34 0.47 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
SUNSET 0 1 0.56 0.50 0.00 0.00 1.00 1.00 1.00 ▆▁▁▁▇
BLAST 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
JUICED 0 1 0.23 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
GUAVA 0 1 0.23 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
RECOVERY 0 1 0.11 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
JACK 0 1 0.11 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
RESERVE 0 1 0.24 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
WHITE 0 1 0.24 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
PITAYA 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ED 0 1 0.30 0.46 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
CASAVA 0 1 0.67 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
REGION_ARIZONA 0 1 0.21 0.41 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
REGION_CALI_NEVADA 0 1 0.03 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_COLORADO 0 1 0.12 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_DESERT_SW 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_KANSAS 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_MOUNTAIN 0 1 0.10 0.30 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_NEWMEXICO 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_NOCAL 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_NORTHERN 0 1 0.26 0.44 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
REGION_PRAIRIE 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
REGION_SOCAL 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ITEM_JUMPIN-FISH ENERGY DRINK CASAVA JACK 0 1 0.00 0.01 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ITEM_MOONLIT GENTLE DRINK CASAVA 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ITEM_MOONLIT GENTLE DRINK SUNSET 0 1 0.33 0.47 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 0 1 0.24 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA 0 1 0.23 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK 0 1 0.11 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
BRAND_DIET MOONLIT 0 1 0.33 0.47 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
BRAND_HILL MOISTURE JUMPIN-FISH 0 1 0.00 0.01 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
BRAND_MOONLIT 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
BRAND_MYTHICAL BEVERAGE 0 1 0.24 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
BRAND_SUPER-DUPER JUICED 0 1 0.23 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
BRAND_SUPER-DUPER PUNCHED 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
BRAND_SUPER-DUPER RECOVERY 0 1 0.11 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
WEEK_OF_YEAR 0 1 26.13 15.56 1.00 12.00 26.00 40.00 53.00 ▇▇▆▆▇
#remove top one percent of unit sales to clean up outliers
df <- casava %>% 
  filter(UNIT_SALES < quantile(UNIT_SALES, 0.99))
# Split the data
set.seed(123)
df_testtrn <- initial_split(df, prop = 0.8, strata = UNIT_SALES)
Train <- training(df_testtrn)
Test <- testing(df_testtrn)

# Prepare features and labels for XGBoost
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES

# Convert data to DMatrix format
dtrain <- xgb.DMatrix(data = as.matrix(train_features), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_features), label = test_labels)
# Define XGBoost parameters
set.seed(123)
params <- list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eval_metric = "rmse",
  eta = 0.05,
  max_depth = 4,
  min_child_weight = 3,
  subsample = 0.7,
  colsample_bytree = 0.6,
  lambda = 1,
  alpha = 1
)
# Perform cross-validation to find the optimal number of boosting rounds
cv_results <- xgb.cv(
  params = params,
  data = dtrain,  
  nfold = 5,
  nrounds = 500,  # Changed from 'num_boost_round' to 'nrounds'
  early_stopping_rounds = 10,
  metrics = "rmse",
  seed = 123
)
## [1]  train-rmse:102.689882+0.780233  test-rmse:102.712290+1.295163 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
## 
## [2]  train-rmse:98.702083+1.363531   test-rmse:98.740296+2.023720 
## [3]  train-rmse:94.937884+1.580653   test-rmse:94.981742+2.173403 
## [4]  train-rmse:90.674844+1.119582   test-rmse:90.711874+1.687706 
## [5]  train-rmse:86.619910+0.943238   test-rmse:86.649333+1.367550 
## [6]  train-rmse:82.755484+0.656333   test-rmse:82.774541+0.939479 
## [7]  train-rmse:79.675260+1.132207   test-rmse:79.700101+1.358782 
## [8]  train-rmse:76.759897+1.470844   test-rmse:76.800500+1.900275 
## [9]  train-rmse:73.416355+1.358047   test-rmse:73.457145+1.890681 
## [10] train-rmse:71.052956+1.462888   test-rmse:71.092276+1.889188 
## [11] train-rmse:68.569834+2.035293   test-rmse:68.606878+2.380012 
## [12] train-rmse:65.591955+1.787084   test-rmse:65.631687+2.206648 
## [13] train-rmse:63.075809+2.206655   test-rmse:63.116145+2.508841 
## [14] train-rmse:61.203811+2.423265   test-rmse:61.242254+2.661690 
## [15] train-rmse:59.461200+2.777820   test-rmse:59.503389+3.034561 
## [16] train-rmse:57.778471+2.806387   test-rmse:57.829825+3.126994 
## [17] train-rmse:55.921283+2.699465   test-rmse:55.968870+3.013889 
## [18] train-rmse:54.212742+3.096257   test-rmse:54.271128+3.461112 
## [19] train-rmse:52.760410+2.788335   test-rmse:52.821160+3.137513 
## [20] train-rmse:51.172859+2.955435   test-rmse:51.237998+3.294276 
## [21] train-rmse:49.982374+3.337959   test-rmse:50.049419+3.680094 
## [22] train-rmse:47.991008+3.162963   test-rmse:48.057874+3.449763 
## [23] train-rmse:46.821312+2.985901   test-rmse:46.892708+3.269121 
## [24] train-rmse:45.052367+3.256110   test-rmse:45.130489+3.537563 
## [25] train-rmse:44.014604+3.194726   test-rmse:44.097792+3.473161 
## [26] train-rmse:42.572573+3.071974   test-rmse:42.652957+3.375914 
## [27] train-rmse:41.222055+3.087495   test-rmse:41.301575+3.326996 
## [28] train-rmse:39.453078+2.946359   test-rmse:39.532408+3.174277 
## [29] train-rmse:38.008798+2.900296   test-rmse:38.093286+3.076730 
## [30] train-rmse:36.784878+2.654663   test-rmse:36.872072+2.833746 
## [31] train-rmse:35.814742+2.424238   test-rmse:35.892399+2.624422 
## [32] train-rmse:34.476246+2.070291   test-rmse:34.564947+2.273299 
## [33] train-rmse:33.417028+1.938389   test-rmse:33.499379+2.155314 
## [34] train-rmse:32.418829+1.796358   test-rmse:32.510437+1.999621 
## [35] train-rmse:31.444095+1.426164   test-rmse:31.544917+1.670367 
## [36] train-rmse:30.358567+1.108329   test-rmse:30.456026+1.351819 
## [37] train-rmse:29.682774+1.402499   test-rmse:29.793692+1.666687 
## [38] train-rmse:28.879992+1.526556   test-rmse:29.001566+1.756459 
## [39] train-rmse:28.243808+1.357528   test-rmse:28.370230+1.612884 
## [40] train-rmse:27.324583+1.094793   test-rmse:27.452676+1.351420 
## [41] train-rmse:26.604741+1.038062   test-rmse:26.735600+1.278487 
## [42] train-rmse:26.070942+1.080355   test-rmse:26.203319+1.319672 
## [43] train-rmse:25.537087+1.000423   test-rmse:25.674606+1.218219 
## [44] train-rmse:25.027773+0.697903   test-rmse:25.170768+0.924103 
## [45] train-rmse:24.288867+0.693500   test-rmse:24.439479+0.947837 
## [46] train-rmse:23.594417+0.776504   test-rmse:23.757349+1.035764 
## [47] train-rmse:23.050716+0.877753   test-rmse:23.215963+1.102088 
## [48] train-rmse:22.549829+1.089395   test-rmse:22.721535+1.328736 
## [49] train-rmse:22.030301+1.038642   test-rmse:22.204668+1.242223 
## [50] train-rmse:21.546810+1.067016   test-rmse:21.727535+1.225743 
## [51] train-rmse:21.304801+1.041195   test-rmse:21.486511+1.238556 
## [52] train-rmse:20.972408+1.137462   test-rmse:21.160810+1.297066 
## [53] train-rmse:20.555603+1.242438   test-rmse:20.750969+1.425080 
## [54] train-rmse:20.336318+1.258458   test-rmse:20.532419+1.456648 
## [55] train-rmse:20.139963+1.303300   test-rmse:20.339343+1.511819 
## [56] train-rmse:19.641997+1.215082   test-rmse:19.845562+1.409409 
## [57] train-rmse:19.373971+1.286251   test-rmse:19.582635+1.451991 
## [58] train-rmse:19.093356+1.154237   test-rmse:19.297180+1.322007 
## [59] train-rmse:18.664548+1.130235   test-rmse:18.867352+1.266556 
## [60] train-rmse:18.356747+1.190502   test-rmse:18.565554+1.305706 
## [61] train-rmse:18.058009+1.266767   test-rmse:18.269644+1.361181 
## [62] train-rmse:17.763066+1.304833   test-rmse:17.976448+1.388284 
## [63] train-rmse:17.392349+1.291859   test-rmse:17.611901+1.391164 
## [64] train-rmse:17.132474+1.346762   test-rmse:17.355011+1.465197 
## [65] train-rmse:16.802423+1.343859   test-rmse:17.029047+1.444147 
## [66] train-rmse:16.483817+1.348208   test-rmse:16.713843+1.460448 
## [67] train-rmse:16.168683+1.273952   test-rmse:16.402159+1.384061 
## [68] train-rmse:15.981736+1.174512   test-rmse:16.222420+1.289637 
## [69] train-rmse:15.767996+1.232178   test-rmse:16.010705+1.360615 
## [70] train-rmse:15.549390+1.171009   test-rmse:15.789847+1.317939 
## [71] train-rmse:15.419484+1.252094   test-rmse:15.660256+1.395755 
## [72] train-rmse:15.178119+1.240006   test-rmse:15.423660+1.381931 
## [73] train-rmse:15.024933+1.169310   test-rmse:15.269806+1.309344 
## [74] train-rmse:14.744385+1.115933   test-rmse:14.989075+1.253997 
## [75] train-rmse:14.554456+1.027026   test-rmse:14.796697+1.160290 
## [76] train-rmse:14.468998+0.980994   test-rmse:14.715988+1.101812 
## [77] train-rmse:14.302921+0.954728   test-rmse:14.550246+1.054157 
## [78] train-rmse:14.207751+1.011058   test-rmse:14.459476+1.110396 
## [79] train-rmse:13.993363+0.921596   test-rmse:14.249039+1.026188 
## [80] train-rmse:13.885177+0.908088   test-rmse:14.142837+1.002259 
## [81] train-rmse:13.707482+0.907199   test-rmse:13.972268+0.988394 
## [82] train-rmse:13.573616+0.871751   test-rmse:13.841192+0.960323 
## [83] train-rmse:13.401598+0.826557   test-rmse:13.671468+0.913425 
## [84] train-rmse:13.238998+0.787131   test-rmse:13.511245+0.867230 
## [85] train-rmse:13.053808+0.756525   test-rmse:13.330726+0.834952 
## [86] train-rmse:12.927942+0.688835   test-rmse:13.208439+0.770070 
## [87] train-rmse:12.883840+0.725243   test-rmse:13.166493+0.801005 
## [88] train-rmse:12.783741+0.731483   test-rmse:13.073882+0.800239 
## [89] train-rmse:12.724299+0.718740   test-rmse:13.015344+0.796116 
## [90] train-rmse:12.636796+0.739325   test-rmse:12.928852+0.804918 
## [91] train-rmse:12.544169+0.724980   test-rmse:12.832758+0.789029 
## [92] train-rmse:12.400237+0.694019   test-rmse:12.688413+0.760329 
## [93] train-rmse:12.299069+0.640253   test-rmse:12.585575+0.703267 
## [94] train-rmse:12.208106+0.598368   test-rmse:12.493493+0.660656 
## [95] train-rmse:12.079379+0.579635   test-rmse:12.366643+0.642090 
## [96] train-rmse:11.963787+0.560759   test-rmse:12.250263+0.622924 
## [97] train-rmse:11.893249+0.573860   test-rmse:12.180747+0.645342 
## [98] train-rmse:11.862918+0.576236   test-rmse:12.148935+0.653339 
## [99] train-rmse:11.783112+0.588486   test-rmse:12.069667+0.659628 
## [100]    train-rmse:11.695365+0.573048   test-rmse:11.981367+0.654339 
## [101]    train-rmse:11.635983+0.593495   test-rmse:11.922046+0.680130 
## [102]    train-rmse:11.579567+0.611677   test-rmse:11.866863+0.703926 
## [103]    train-rmse:11.484293+0.586886   test-rmse:11.772220+0.679067 
## [104]    train-rmse:11.420143+0.565122   test-rmse:11.710143+0.655879 
## [105]    train-rmse:11.365886+0.571852   test-rmse:11.659728+0.659930 
## [106]    train-rmse:11.295451+0.550707   test-rmse:11.589531+0.637279 
## [107]    train-rmse:11.244492+0.523744   test-rmse:11.540145+0.619710 
## [108]    train-rmse:11.197286+0.536051   test-rmse:11.497243+0.626526 
## [109]    train-rmse:11.140789+0.510071   test-rmse:11.443049+0.600816 
## [110]    train-rmse:11.089424+0.501646   test-rmse:11.390910+0.596461 
## [111]    train-rmse:11.029752+0.489832   test-rmse:11.332292+0.581800 
## [112]    train-rmse:10.978429+0.466366   test-rmse:11.281589+0.571699 
## [113]    train-rmse:10.960257+0.476238   test-rmse:11.263753+0.580611 
## [114]    train-rmse:10.916419+0.478981   test-rmse:11.221794+0.575503 
## [115]    train-rmse:10.869904+0.454041   test-rmse:11.175240+0.549511 
## [116]    train-rmse:10.817713+0.442914   test-rmse:11.124262+0.535468 
## [117]    train-rmse:10.758573+0.430308   test-rmse:11.065993+0.526622 
## [118]    train-rmse:10.708887+0.422436   test-rmse:11.016220+0.529604 
## [119]    train-rmse:10.656801+0.396630   test-rmse:10.964880+0.509991 
## [120]    train-rmse:10.604107+0.387217   test-rmse:10.909600+0.499444 
## [121]    train-rmse:10.571951+0.387818   test-rmse:10.878389+0.501600 
## [122]    train-rmse:10.548940+0.395535   test-rmse:10.857190+0.509593 
## [123]    train-rmse:10.508228+0.378576   test-rmse:10.818514+0.497348 
## [124]    train-rmse:10.461029+0.369023   test-rmse:10.773970+0.488230 
## [125]    train-rmse:10.438132+0.366194   test-rmse:10.752078+0.491521 
## [126]    train-rmse:10.405665+0.362311   test-rmse:10.720880+0.481806 
## [127]    train-rmse:10.372919+0.347835   test-rmse:10.687829+0.468087 
## [128]    train-rmse:10.338219+0.343580   test-rmse:10.654994+0.471222 
## [129]    train-rmse:10.302982+0.330495   test-rmse:10.618090+0.459364 
## [130]    train-rmse:10.268150+0.318149   test-rmse:10.582318+0.452690 
## [131]    train-rmse:10.239830+0.304422   test-rmse:10.553416+0.435535 
## [132]    train-rmse:10.214925+0.294318   test-rmse:10.528403+0.427561 
## [133]    train-rmse:10.187567+0.283916   test-rmse:10.499851+0.418485 
## [134]    train-rmse:10.162352+0.285777   test-rmse:10.472926+0.408416 
## [135]    train-rmse:10.138565+0.290017   test-rmse:10.449473+0.416144 
## [136]    train-rmse:10.110289+0.289661   test-rmse:10.420505+0.420958 
## [137]    train-rmse:10.080557+0.283655   test-rmse:10.391616+0.410724 
## [138]    train-rmse:10.052450+0.272125   test-rmse:10.364044+0.402295 
## [139]    train-rmse:10.030154+0.268143   test-rmse:10.343941+0.395366 
## [140]    train-rmse:10.003584+0.270149   test-rmse:10.314946+0.389151 
## [141]    train-rmse:9.981108+0.272146    test-rmse:10.292370+0.392469 
## [142]    train-rmse:9.968613+0.270443    test-rmse:10.280058+0.393368 
## [143]    train-rmse:9.954507+0.272755    test-rmse:10.266737+0.395634 
## [144]    train-rmse:9.945733+0.272524    test-rmse:10.257521+0.394620 
## [145]    train-rmse:9.928398+0.268431    test-rmse:10.240430+0.386672 
## [146]    train-rmse:9.909482+0.269592    test-rmse:10.221476+0.390443 
## [147]    train-rmse:9.888770+0.273547    test-rmse:10.200798+0.392274 
## [148]    train-rmse:9.868002+0.272527    test-rmse:10.180335+0.393892 
## [149]    train-rmse:9.854154+0.274366    test-rmse:10.166357+0.394305 
## [150]    train-rmse:9.840402+0.269503    test-rmse:10.150983+0.391375 
## [151]    train-rmse:9.817040+0.269112    test-rmse:10.127662+0.394175 
## [152]    train-rmse:9.804457+0.270549    test-rmse:10.115482+0.395990 
## [153]    train-rmse:9.791049+0.267749    test-rmse:10.103111+0.396191 
## [154]    train-rmse:9.768323+0.259712    test-rmse:10.079521+0.386205 
## [155]    train-rmse:9.743681+0.257151    test-rmse:10.055079+0.384576 
## [156]    train-rmse:9.726397+0.256747    test-rmse:10.038817+0.382729 
## [157]    train-rmse:9.711468+0.260261    test-rmse:10.023997+0.388367 
## [158]    train-rmse:9.696066+0.262473    test-rmse:10.010005+0.388253 
## [159]    train-rmse:9.678501+0.258117    test-rmse:9.990132+0.380896 
## [160]    train-rmse:9.667259+0.258058    test-rmse:9.979350+0.380847 
## [161]    train-rmse:9.650305+0.255550    test-rmse:9.962040+0.374532 
## [162]    train-rmse:9.643441+0.254026    test-rmse:9.955491+0.371921 
## [163]    train-rmse:9.624506+0.254014    test-rmse:9.938577+0.373106 
## [164]    train-rmse:9.610376+0.250247    test-rmse:9.924525+0.372352 
## [165]    train-rmse:9.597436+0.249916    test-rmse:9.911904+0.372420 
## [166]    train-rmse:9.583782+0.246253    test-rmse:9.899458+0.369973 
## [167]    train-rmse:9.568854+0.242033    test-rmse:9.885391+0.370372 
## [168]    train-rmse:9.559104+0.239018    test-rmse:9.874243+0.368307 
## [169]    train-rmse:9.538059+0.234444    test-rmse:9.853492+0.366974 
## [170]    train-rmse:9.522909+0.233828    test-rmse:9.839547+0.370557 
## [171]    train-rmse:9.508664+0.230178    test-rmse:9.824994+0.362440 
## [172]    train-rmse:9.498856+0.229309    test-rmse:9.816595+0.362519 
## [173]    train-rmse:9.490717+0.228929    test-rmse:9.808746+0.362582 
## [174]    train-rmse:9.479309+0.232886    test-rmse:9.797744+0.366530 
## [175]    train-rmse:9.460476+0.227874    test-rmse:9.780552+0.363627 
## [176]    train-rmse:9.443375+0.223134    test-rmse:9.763211+0.359304 
## [177]    train-rmse:9.431810+0.225680    test-rmse:9.752993+0.361286 
## [178]    train-rmse:9.422489+0.224588    test-rmse:9.744180+0.360350 
## [179]    train-rmse:9.408737+0.219965    test-rmse:9.731364+0.354629 
## [180]    train-rmse:9.398624+0.221871    test-rmse:9.720717+0.357535 
## [181]    train-rmse:9.388951+0.221443    test-rmse:9.710355+0.352998 
## [182]    train-rmse:9.376351+0.217778    test-rmse:9.696782+0.350414 
## [183]    train-rmse:9.368590+0.217224    test-rmse:9.689716+0.351356 
## [184]    train-rmse:9.352060+0.213476    test-rmse:9.670973+0.342864 
## [185]    train-rmse:9.341866+0.215791    test-rmse:9.661568+0.344894 
## [186]    train-rmse:9.331442+0.212573    test-rmse:9.651487+0.344799 
## [187]    train-rmse:9.319191+0.209051    test-rmse:9.638959+0.342304 
## [188]    train-rmse:9.307088+0.207991    test-rmse:9.626953+0.343549 
## [189]    train-rmse:9.298179+0.206607    test-rmse:9.618282+0.345937 
## [190]    train-rmse:9.289605+0.207347    test-rmse:9.610133+0.347884 
## [191]    train-rmse:9.281917+0.205572    test-rmse:9.603140+0.347166 
## [192]    train-rmse:9.273835+0.205125    test-rmse:9.595968+0.346556 
## [193]    train-rmse:9.264224+0.204798    test-rmse:9.588140+0.345883 
## [194]    train-rmse:9.253228+0.205418    test-rmse:9.577372+0.348348 
## [195]    train-rmse:9.241636+0.205678    test-rmse:9.567641+0.346881 
## [196]    train-rmse:9.229632+0.202709    test-rmse:9.556858+0.348469 
## [197]    train-rmse:9.221069+0.200158    test-rmse:9.548216+0.346199 
## [198]    train-rmse:9.207564+0.194758    test-rmse:9.535643+0.341696 
## [199]    train-rmse:9.196649+0.194726    test-rmse:9.524671+0.337533 
## [200]    train-rmse:9.191955+0.193856    test-rmse:9.520129+0.337843 
## [201]    train-rmse:9.179259+0.189434    test-rmse:9.507863+0.334176 
## [202]    train-rmse:9.169101+0.187974    test-rmse:9.497131+0.329239 
## [203]    train-rmse:9.153462+0.186197    test-rmse:9.481736+0.328483 
## [204]    train-rmse:9.144964+0.186667    test-rmse:9.473858+0.328080 
## [205]    train-rmse:9.138452+0.187495    test-rmse:9.468884+0.330553 
## [206]    train-rmse:9.132580+0.187066    test-rmse:9.463667+0.330729 
## [207]    train-rmse:9.125553+0.184862    test-rmse:9.456194+0.323541 
## [208]    train-rmse:9.119051+0.185645    test-rmse:9.449886+0.326456 
## [209]    train-rmse:9.111306+0.185289    test-rmse:9.442230+0.326988 
## [210]    train-rmse:9.102033+0.183870    test-rmse:9.433643+0.325617 
## [211]    train-rmse:9.096477+0.183626    test-rmse:9.428649+0.325943 
## [212]    train-rmse:9.085712+0.184233    test-rmse:9.419220+0.326119 
## [213]    train-rmse:9.079890+0.182994    test-rmse:9.413699+0.323949 
## [214]    train-rmse:9.071769+0.181029    test-rmse:9.406076+0.321504 
## [215]    train-rmse:9.061844+0.179566    test-rmse:9.396172+0.319577 
## [216]    train-rmse:9.050595+0.177879    test-rmse:9.385558+0.317910 
## [217]    train-rmse:9.040974+0.175041    test-rmse:9.377395+0.319038 
## [218]    train-rmse:9.034094+0.173564    test-rmse:9.370442+0.318016 
## [219]    train-rmse:9.025810+0.170800    test-rmse:9.362142+0.314908 
## [220]    train-rmse:9.020363+0.169165    test-rmse:9.356656+0.313846 
## [221]    train-rmse:9.012264+0.166060    test-rmse:9.349887+0.314868 
## [222]    train-rmse:9.005342+0.165913    test-rmse:9.343455+0.313804 
## [223]    train-rmse:8.998773+0.165329    test-rmse:9.336193+0.315482 
## [224]    train-rmse:8.990972+0.163964    test-rmse:9.329155+0.313307 
## [225]    train-rmse:8.985557+0.163477    test-rmse:9.324181+0.314096 
## [226]    train-rmse:8.979931+0.165863    test-rmse:9.318450+0.316042 
## [227]    train-rmse:8.969303+0.163417    test-rmse:9.307984+0.315869 
## [228]    train-rmse:8.961827+0.162184    test-rmse:9.300960+0.316621 
## [229]    train-rmse:8.954631+0.160719    test-rmse:9.294833+0.315701 
## [230]    train-rmse:8.947914+0.159064    test-rmse:9.287804+0.310467 
## [231]    train-rmse:8.941145+0.159324    test-rmse:9.282284+0.307436 
## [232]    train-rmse:8.932537+0.156024    test-rmse:9.273848+0.305573 
## [233]    train-rmse:8.925758+0.155045    test-rmse:9.268126+0.305893 
## [234]    train-rmse:8.919624+0.153138    test-rmse:9.262503+0.306788 
## [235]    train-rmse:8.912716+0.150354    test-rmse:9.254581+0.302972 
## [236]    train-rmse:8.907617+0.151331    test-rmse:9.249833+0.302216 
## [237]    train-rmse:8.902585+0.150090    test-rmse:9.244779+0.301269 
## [238]    train-rmse:8.897156+0.148726    test-rmse:9.239674+0.300181 
## [239]    train-rmse:8.890984+0.149326    test-rmse:9.233980+0.301389 
## [240]    train-rmse:8.884421+0.147570    test-rmse:9.227653+0.302575 
## [241]    train-rmse:8.880146+0.147908    test-rmse:9.223793+0.303488 
## [242]    train-rmse:8.873267+0.145659    test-rmse:9.216839+0.304004 
## [243]    train-rmse:8.867021+0.144497    test-rmse:9.210371+0.299925 
## [244]    train-rmse:8.860797+0.142364    test-rmse:9.205169+0.298167 
## [245]    train-rmse:8.854486+0.143742    test-rmse:9.198919+0.299736 
## [246]    train-rmse:8.846043+0.144562    test-rmse:9.191826+0.300122 
## [247]    train-rmse:8.838592+0.144308    test-rmse:9.185778+0.299641 
## [248]    train-rmse:8.829448+0.140770    test-rmse:9.176742+0.296749 
## [249]    train-rmse:8.825030+0.140110    test-rmse:9.172775+0.296628 
## [250]    train-rmse:8.819097+0.140169    test-rmse:9.167867+0.297909 
## [251]    train-rmse:8.813894+0.138937    test-rmse:9.163669+0.297420 
## [252]    train-rmse:8.809403+0.137574    test-rmse:9.159674+0.297060 
## [253]    train-rmse:8.803683+0.135299    test-rmse:9.154742+0.298260 
## [254]    train-rmse:8.798818+0.134952    test-rmse:9.150083+0.297477 
## [255]    train-rmse:8.793535+0.134317    test-rmse:9.144977+0.294710 
## [256]    train-rmse:8.787952+0.131667    test-rmse:9.139872+0.293806 
## [257]    train-rmse:8.781135+0.130991    test-rmse:9.133284+0.295634 
## [258]    train-rmse:8.777143+0.129929    test-rmse:9.129867+0.295065 
## [259]    train-rmse:8.769674+0.130471    test-rmse:9.123098+0.297029 
## [260]    train-rmse:8.763330+0.130407    test-rmse:9.116956+0.297072 
## [261]    train-rmse:8.756798+0.129322    test-rmse:9.109881+0.294106 
## [262]    train-rmse:8.752204+0.128367    test-rmse:9.105869+0.292401 
## [263]    train-rmse:8.745731+0.127203    test-rmse:9.099680+0.290743 
## [264]    train-rmse:8.739232+0.127818    test-rmse:9.094262+0.292436 
## [265]    train-rmse:8.734261+0.126795    test-rmse:9.089100+0.294061 
## [266]    train-rmse:8.729095+0.126803    test-rmse:9.084026+0.294060 
## [267]    train-rmse:8.723178+0.126670    test-rmse:9.077963+0.295944 
## [268]    train-rmse:8.717808+0.124584    test-rmse:9.072453+0.294002 
## [269]    train-rmse:8.712367+0.123935    test-rmse:9.067154+0.295312 
## [270]    train-rmse:8.706577+0.123199    test-rmse:9.060968+0.295027 
## [271]    train-rmse:8.699446+0.123875    test-rmse:9.054215+0.294961 
## [272]    train-rmse:8.692744+0.122394    test-rmse:9.048680+0.293942 
## [273]    train-rmse:8.686876+0.122380    test-rmse:9.043430+0.293734 
## [274]    train-rmse:8.681089+0.123495    test-rmse:9.037688+0.295458 
## [275]    train-rmse:8.676207+0.123389    test-rmse:9.032897+0.295966 
## [276]    train-rmse:8.672217+0.123867    test-rmse:9.029221+0.297095 
## [277]    train-rmse:8.666440+0.124048    test-rmse:9.023883+0.295691 
## [278]    train-rmse:8.662211+0.123530    test-rmse:9.019612+0.295833 
## [279]    train-rmse:8.657963+0.122558    test-rmse:9.015586+0.296335 
## [280]    train-rmse:8.651971+0.120592    test-rmse:9.010602+0.295805 
## [281]    train-rmse:8.648824+0.119552    test-rmse:9.008077+0.295589 
## [282]    train-rmse:8.640981+0.118904    test-rmse:9.000690+0.291440 
## [283]    train-rmse:8.636444+0.119869    test-rmse:8.996280+0.290680 
## [284]    train-rmse:8.632246+0.121019    test-rmse:8.991760+0.290731 
## [285]    train-rmse:8.626691+0.121583    test-rmse:8.986453+0.288197 
## [286]    train-rmse:8.623153+0.121773    test-rmse:8.983662+0.289596 
## [287]    train-rmse:8.618866+0.120909    test-rmse:8.979897+0.290130 
## [288]    train-rmse:8.614476+0.121727    test-rmse:8.975920+0.289932 
## [289]    train-rmse:8.609075+0.124274    test-rmse:8.970103+0.290498 
## [290]    train-rmse:8.603500+0.122092    test-rmse:8.965922+0.292216 
## [291]    train-rmse:8.598647+0.120814    test-rmse:8.960602+0.288102 
## [292]    train-rmse:8.592922+0.119429    test-rmse:8.954556+0.289028 
## [293]    train-rmse:8.587861+0.118669    test-rmse:8.949981+0.287435 
## [294]    train-rmse:8.582219+0.118403    test-rmse:8.944576+0.289784 
## [295]    train-rmse:8.578663+0.118312    test-rmse:8.941169+0.288191 
## [296]    train-rmse:8.573927+0.117161    test-rmse:8.937105+0.288778 
## [297]    train-rmse:8.569051+0.115084    test-rmse:8.932944+0.289161 
## [298]    train-rmse:8.564935+0.116027    test-rmse:8.929085+0.290867 
## [299]    train-rmse:8.559643+0.114148    test-rmse:8.924736+0.290040 
## [300]    train-rmse:8.557081+0.114107    test-rmse:8.922411+0.289541 
## [301]    train-rmse:8.552046+0.113961    test-rmse:8.917877+0.289033 
## [302]    train-rmse:8.546947+0.113548    test-rmse:8.913953+0.289977 
## [303]    train-rmse:8.542004+0.113007    test-rmse:8.909105+0.290286 
## [304]    train-rmse:8.534340+0.113259    test-rmse:8.902345+0.289642 
## [305]    train-rmse:8.530323+0.113360    test-rmse:8.898568+0.291441 
## [306]    train-rmse:8.524599+0.112977    test-rmse:8.893919+0.290491 
## [307]    train-rmse:8.519382+0.112010    test-rmse:8.889378+0.288221 
## [308]    train-rmse:8.515482+0.111810    test-rmse:8.886067+0.288743 
## [309]    train-rmse:8.511361+0.113562    test-rmse:8.882429+0.289770 
## [310]    train-rmse:8.507339+0.111798    test-rmse:8.878886+0.289247 
## [311]    train-rmse:8.504201+0.112198    test-rmse:8.877127+0.291976 
## [312]    train-rmse:8.500366+0.112488    test-rmse:8.873144+0.290968 
## [313]    train-rmse:8.495998+0.111841    test-rmse:8.869097+0.291137 
## [314]    train-rmse:8.491633+0.112011    test-rmse:8.865526+0.290485 
## [315]    train-rmse:8.488577+0.112059    test-rmse:8.862831+0.289537 
## [316]    train-rmse:8.484472+0.113356    test-rmse:8.858373+0.288132 
## [317]    train-rmse:8.481425+0.113585    test-rmse:8.855268+0.287982 
## [318]    train-rmse:8.476089+0.113091    test-rmse:8.849023+0.284105 
## [319]    train-rmse:8.471962+0.114105    test-rmse:8.845141+0.284487 
## [320]    train-rmse:8.466069+0.113788    test-rmse:8.839102+0.283357 
## [321]    train-rmse:8.461702+0.114239    test-rmse:8.835062+0.285226 
## [322]    train-rmse:8.458019+0.116074    test-rmse:8.831525+0.286963 
## [323]    train-rmse:8.454229+0.115193    test-rmse:8.828337+0.286917 
## [324]    train-rmse:8.450663+0.115165    test-rmse:8.826303+0.288908 
## [325]    train-rmse:8.445919+0.114698    test-rmse:8.822217+0.286673 
## [326]    train-rmse:8.442293+0.114011    test-rmse:8.818598+0.286938 
## [327]    train-rmse:8.438979+0.113680    test-rmse:8.815336+0.287565 
## [328]    train-rmse:8.434216+0.112974    test-rmse:8.811775+0.287072 
## [329]    train-rmse:8.429879+0.113630    test-rmse:8.807816+0.285661 
## [330]    train-rmse:8.425727+0.113823    test-rmse:8.804281+0.286277 
## [331]    train-rmse:8.421887+0.113541    test-rmse:8.800897+0.284239 
## [332]    train-rmse:8.417911+0.114209    test-rmse:8.798651+0.283248 
## [333]    train-rmse:8.413847+0.113526    test-rmse:8.794633+0.284324 
## [334]    train-rmse:8.411417+0.113605    test-rmse:8.792217+0.284816 
## [335]    train-rmse:8.407450+0.114286    test-rmse:8.788558+0.283517 
## [336]    train-rmse:8.403794+0.114789    test-rmse:8.785336+0.284982 
## [337]    train-rmse:8.398787+0.117646    test-rmse:8.780856+0.284771 
## [338]    train-rmse:8.396425+0.117797    test-rmse:8.779326+0.285043 
## [339]    train-rmse:8.392445+0.117506    test-rmse:8.775160+0.280358 
## [340]    train-rmse:8.386736+0.118151    test-rmse:8.770392+0.280905 
## [341]    train-rmse:8.383048+0.118023    test-rmse:8.767437+0.279058 
## [342]    train-rmse:8.379728+0.117573    test-rmse:8.765095+0.279662 
## [343]    train-rmse:8.375833+0.117491    test-rmse:8.762383+0.280367 
## [344]    train-rmse:8.372060+0.116695    test-rmse:8.758993+0.280416 
## [345]    train-rmse:8.366936+0.116869    test-rmse:8.754642+0.281481 
## [346]    train-rmse:8.361933+0.115284    test-rmse:8.750303+0.279365 
## [347]    train-rmse:8.357452+0.116801    test-rmse:8.745061+0.280778 
## [348]    train-rmse:8.354712+0.115382    test-rmse:8.742765+0.280844 
## [349]    train-rmse:8.352015+0.116136    test-rmse:8.739906+0.282051 
## [350]    train-rmse:8.346868+0.114409    test-rmse:8.736365+0.281574 
## [351]    train-rmse:8.342710+0.113397    test-rmse:8.732858+0.281329 
## [352]    train-rmse:8.337940+0.111874    test-rmse:8.729558+0.282076 
## [353]    train-rmse:8.335345+0.111796    test-rmse:8.727525+0.281774 
## [354]    train-rmse:8.331049+0.113244    test-rmse:8.723280+0.280515 
## [355]    train-rmse:8.326952+0.112691    test-rmse:8.719346+0.279671 
## [356]    train-rmse:8.323652+0.111643    test-rmse:8.716547+0.278764 
## [357]    train-rmse:8.319223+0.112060    test-rmse:8.712922+0.276219 
## [358]    train-rmse:8.315540+0.112462    test-rmse:8.709384+0.276912 
## [359]    train-rmse:8.311421+0.111450    test-rmse:8.705603+0.276602 
## [360]    train-rmse:8.308662+0.110867    test-rmse:8.703212+0.276282 
## [361]    train-rmse:8.306142+0.109864    test-rmse:8.701069+0.276283 
## [362]    train-rmse:8.302051+0.110345    test-rmse:8.697326+0.275360 
## [363]    train-rmse:8.299490+0.110271    test-rmse:8.695999+0.275246 
## [364]    train-rmse:8.294761+0.109359    test-rmse:8.692178+0.273835 
## [365]    train-rmse:8.289644+0.110095    test-rmse:8.687572+0.273107 
## [366]    train-rmse:8.286680+0.109957    test-rmse:8.684767+0.272776 
## [367]    train-rmse:8.282881+0.111044    test-rmse:8.681073+0.272865 
## [368]    train-rmse:8.279471+0.111701    test-rmse:8.678821+0.272302 
## [369]    train-rmse:8.276706+0.110443    test-rmse:8.676928+0.272265 
## [370]    train-rmse:8.272687+0.109785    test-rmse:8.673238+0.271694 
## [371]    train-rmse:8.269869+0.108466    test-rmse:8.671549+0.272235 
## [372]    train-rmse:8.267546+0.108246    test-rmse:8.669678+0.272108 
## [373]    train-rmse:8.262809+0.108325    test-rmse:8.665331+0.273371 
## [374]    train-rmse:8.260179+0.108811    test-rmse:8.663379+0.273604 
## [375]    train-rmse:8.256976+0.108769    test-rmse:8.660668+0.273304 
## [376]    train-rmse:8.253582+0.108666    test-rmse:8.657538+0.273504 
## [377]    train-rmse:8.250138+0.107979    test-rmse:8.654788+0.275059 
## [378]    train-rmse:8.246836+0.108530    test-rmse:8.651400+0.274510 
## [379]    train-rmse:8.244481+0.108627    test-rmse:8.649024+0.275649 
## [380]    train-rmse:8.239697+0.111188    test-rmse:8.645103+0.274424 
## [381]    train-rmse:8.237070+0.110921    test-rmse:8.642551+0.273709 
## [382]    train-rmse:8.233694+0.110977    test-rmse:8.640528+0.273643 
## [383]    train-rmse:8.229069+0.108992    test-rmse:8.635601+0.274816 
## [384]    train-rmse:8.224621+0.108151    test-rmse:8.631816+0.275402 
## [385]    train-rmse:8.220769+0.108071    test-rmse:8.628644+0.276042 
## [386]    train-rmse:8.216255+0.106751    test-rmse:8.624903+0.276680 
## [387]    train-rmse:8.211975+0.106718    test-rmse:8.620802+0.275735 
## [388]    train-rmse:8.207489+0.106179    test-rmse:8.616847+0.274495 
## [389]    train-rmse:8.202949+0.105573    test-rmse:8.612800+0.273785 
## [390]    train-rmse:8.199502+0.106011    test-rmse:8.609369+0.271596 
## [391]    train-rmse:8.194150+0.104656    test-rmse:8.604937+0.271292 
## [392]    train-rmse:8.191524+0.104844    test-rmse:8.602938+0.271049 
## [393]    train-rmse:8.186738+0.104256    test-rmse:8.598852+0.271595 
## [394]    train-rmse:8.182681+0.102268    test-rmse:8.595140+0.272974 
## [395]    train-rmse:8.180128+0.101560    test-rmse:8.592829+0.273328 
## [396]    train-rmse:8.175660+0.101261    test-rmse:8.589334+0.271899 
## [397]    train-rmse:8.174261+0.101533    test-rmse:8.587933+0.271778 
## [398]    train-rmse:8.171317+0.100060    test-rmse:8.585193+0.271646 
## [399]    train-rmse:8.168709+0.099048    test-rmse:8.583225+0.271773 
## [400]    train-rmse:8.166076+0.097906    test-rmse:8.581148+0.272330 
## [401]    train-rmse:8.163575+0.098403    test-rmse:8.578954+0.272782 
## [402]    train-rmse:8.158012+0.097720    test-rmse:8.574222+0.274629 
## [403]    train-rmse:8.154116+0.096133    test-rmse:8.570826+0.274974 
## [404]    train-rmse:8.150174+0.095481    test-rmse:8.567481+0.276702 
## [405]    train-rmse:8.147309+0.095743    test-rmse:8.564858+0.275559 
## [406]    train-rmse:8.144409+0.094807    test-rmse:8.562187+0.274224 
## [407]    train-rmse:8.140903+0.094755    test-rmse:8.559598+0.273337 
## [408]    train-rmse:8.137098+0.094581    test-rmse:8.556940+0.273858 
## [409]    train-rmse:8.134405+0.094077    test-rmse:8.554519+0.273619 
## [410]    train-rmse:8.132254+0.092561    test-rmse:8.552679+0.274133 
## [411]    train-rmse:8.129830+0.093374    test-rmse:8.550844+0.274591 
## [412]    train-rmse:8.128153+0.093541    test-rmse:8.549533+0.275026 
## [413]    train-rmse:8.124656+0.092838    test-rmse:8.546348+0.274941 
## [414]    train-rmse:8.121510+0.093191    test-rmse:8.543754+0.273746 
## [415]    train-rmse:8.117702+0.092869    test-rmse:8.540696+0.274347 
## [416]    train-rmse:8.115480+0.092273    test-rmse:8.539314+0.274346 
## [417]    train-rmse:8.112507+0.091522    test-rmse:8.536267+0.275698 
## [418]    train-rmse:8.110397+0.091746    test-rmse:8.534393+0.276187 
## [419]    train-rmse:8.107155+0.090992    test-rmse:8.532109+0.275996 
## [420]    train-rmse:8.104655+0.090915    test-rmse:8.530622+0.275597 
## [421]    train-rmse:8.101663+0.091469    test-rmse:8.527582+0.274279 
## [422]    train-rmse:8.098822+0.090770    test-rmse:8.525194+0.274080 
## [423]    train-rmse:8.095387+0.090274    test-rmse:8.523229+0.274242 
## [424]    train-rmse:8.092198+0.090249    test-rmse:8.519850+0.275152 
## [425]    train-rmse:8.089833+0.089624    test-rmse:8.517859+0.275580 
## [426]    train-rmse:8.085585+0.089732    test-rmse:8.515005+0.276292 
## [427]    train-rmse:8.082400+0.089329    test-rmse:8.512342+0.276278 
## [428]    train-rmse:8.078796+0.089875    test-rmse:8.509256+0.276792 
## [429]    train-rmse:8.075849+0.089464    test-rmse:8.506932+0.277166 
## [430]    train-rmse:8.073041+0.087549    test-rmse:8.504521+0.277748 
## [431]    train-rmse:8.071242+0.087549    test-rmse:8.502811+0.277996 
## [432]    train-rmse:8.069029+0.086865    test-rmse:8.501328+0.279035 
## [433]    train-rmse:8.064213+0.087649    test-rmse:8.498113+0.279256 
## [434]    train-rmse:8.062222+0.087349    test-rmse:8.496344+0.279523 
## [435]    train-rmse:8.059366+0.086985    test-rmse:8.494391+0.280362 
## [436]    train-rmse:8.055438+0.086068    test-rmse:8.491416+0.279421 
## [437]    train-rmse:8.052108+0.086885    test-rmse:8.488386+0.278579 
## [438]    train-rmse:8.048850+0.085997    test-rmse:8.486083+0.278815 
## [439]    train-rmse:8.046628+0.086346    test-rmse:8.483849+0.277993 
## [440]    train-rmse:8.042889+0.086005    test-rmse:8.480684+0.277843 
## [441]    train-rmse:8.040497+0.086143    test-rmse:8.478938+0.277881 
## [442]    train-rmse:8.037734+0.085031    test-rmse:8.476723+0.277461 
## [443]    train-rmse:8.034520+0.084576    test-rmse:8.473696+0.276319 
## [444]    train-rmse:8.032463+0.084402    test-rmse:8.471935+0.275350 
## [445]    train-rmse:8.029952+0.084621    test-rmse:8.469428+0.273860 
## [446]    train-rmse:8.026661+0.085226    test-rmse:8.466218+0.274976 
## [447]    train-rmse:8.023922+0.084412    test-rmse:8.464393+0.275190 
## [448]    train-rmse:8.020094+0.082567    test-rmse:8.461365+0.274554 
## [449]    train-rmse:8.017666+0.082422    test-rmse:8.459771+0.273872 
## [450]    train-rmse:8.015571+0.081732    test-rmse:8.458643+0.272889 
## [451]    train-rmse:8.013494+0.081625    test-rmse:8.456833+0.271866 
## [452]    train-rmse:8.011926+0.081374    test-rmse:8.455317+0.271847 
## [453]    train-rmse:8.010041+0.081411    test-rmse:8.453983+0.272148 
## [454]    train-rmse:8.007793+0.081161    test-rmse:8.452064+0.271696 
## [455]    train-rmse:8.005199+0.080528    test-rmse:8.450239+0.272341 
## [456]    train-rmse:8.002216+0.080523    test-rmse:8.448045+0.272736 
## [457]    train-rmse:7.999707+0.079783    test-rmse:8.445560+0.273918 
## [458]    train-rmse:7.997831+0.079700    test-rmse:8.444458+0.274007 
## [459]    train-rmse:7.994603+0.080771    test-rmse:8.440866+0.273367 
## [460]    train-rmse:7.992879+0.080432    test-rmse:8.439574+0.273518 
## [461]    train-rmse:7.990083+0.080111    test-rmse:8.436619+0.273167 
## [462]    train-rmse:7.987834+0.080243    test-rmse:8.434950+0.272523 
## [463]    train-rmse:7.984344+0.079891    test-rmse:8.431353+0.273276 
## [464]    train-rmse:7.981525+0.079210    test-rmse:8.428841+0.273198 
## [465]    train-rmse:7.978375+0.080101    test-rmse:8.427250+0.273470 
## [466]    train-rmse:7.974630+0.080433    test-rmse:8.424097+0.274155 
## [467]    train-rmse:7.973415+0.080284    test-rmse:8.423350+0.274252 
## [468]    train-rmse:7.971295+0.080645    test-rmse:8.421641+0.275041 
## [469]    train-rmse:7.968706+0.080043    test-rmse:8.419375+0.275431 
## [470]    train-rmse:7.966567+0.079792    test-rmse:8.417928+0.275207 
## [471]    train-rmse:7.962298+0.078212    test-rmse:8.416155+0.275486 
## [472]    train-rmse:7.959742+0.078490    test-rmse:8.413780+0.276457 
## [473]    train-rmse:7.956085+0.080289    test-rmse:8.410702+0.275732 
## [474]    train-rmse:7.952344+0.080014    test-rmse:8.409303+0.275206 
## [475]    train-rmse:7.949541+0.079862    test-rmse:8.406817+0.275842 
## [476]    train-rmse:7.946075+0.078937    test-rmse:8.404160+0.276621 
## [477]    train-rmse:7.944195+0.079417    test-rmse:8.402513+0.275579 
## [478]    train-rmse:7.941987+0.079256    test-rmse:8.400459+0.274787 
## [479]    train-rmse:7.938580+0.080687    test-rmse:8.397437+0.274617 
## [480]    train-rmse:7.935722+0.082294    test-rmse:8.395603+0.274863 
## [481]    train-rmse:7.932792+0.081274    test-rmse:8.392947+0.274775 
## [482]    train-rmse:7.929390+0.080549    test-rmse:8.389311+0.271597 
## [483]    train-rmse:7.926961+0.081726    test-rmse:8.387283+0.271725 
## [484]    train-rmse:7.925202+0.081580    test-rmse:8.384492+0.272148 
## [485]    train-rmse:7.922907+0.081142    test-rmse:8.382499+0.272672 
## [486]    train-rmse:7.919974+0.080485    test-rmse:8.380681+0.272544 
## [487]    train-rmse:7.917350+0.079643    test-rmse:8.379838+0.272426 
## [488]    train-rmse:7.915029+0.080543    test-rmse:8.377723+0.272524 
## [489]    train-rmse:7.912765+0.080303    test-rmse:8.375401+0.272688 
## [490]    train-rmse:7.909595+0.079122    test-rmse:8.373326+0.269522 
## [491]    train-rmse:7.905735+0.077850    test-rmse:8.370394+0.268919 
## [492]    train-rmse:7.902571+0.078012    test-rmse:8.369038+0.268340 
## [493]    train-rmse:7.899653+0.078557    test-rmse:8.366764+0.268550 
## [494]    train-rmse:7.896992+0.079259    test-rmse:8.365347+0.269079 
## [495]    train-rmse:7.894135+0.079491    test-rmse:8.363141+0.268657 
## [496]    train-rmse:7.890822+0.078929    test-rmse:8.361437+0.268940 
## [497]    train-rmse:7.888291+0.079203    test-rmse:8.359362+0.268691 
## [498]    train-rmse:7.885632+0.079574    test-rmse:8.357145+0.268666 
## [499]    train-rmse:7.883213+0.079013    test-rmse:8.355656+0.269265 
## [500]    train-rmse:7.880386+0.077602    test-rmse:8.353502+0.268309
best_nrounds <- cv_results$best_iteration
# Train the final model using the best number of rounds found
model_xgb <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb, dtrain)
test_pred <- predict(model_xgb, dtest)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)

# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)

train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
cat("Model Performance Metrics:\n",
    "--------------------------\n",
    "Training RMSE: ", train_rmse, "\n",
    "Test RMSE: ", test_rmse, "\n",
    "Training R-squared: ", r_squared_train, "\n",
    "Test R-squared: ", r_squared_test, "\n",
    "Training MAE: ", train_mae, "\n",
    "Test MAE: ", test_mae, "\n",
    "Training MAPE: ", train_mape, "%\n",
    "Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 7.930191
## Test RMSE: 8.011783
## Training R-squared: 0.9902172
## Test R-squared: 0.9896139
## Training MAE: 4.898334
## Test MAE: 4.971412
## Training MAPE: 16.84768%
## Test MAPE: 17.11487%
# Correcting Residuals Data Frame
# Assuming 'train_labels' and 'test_labels' contain the actual values,
# and 'train_pred' and 'test_pred' contain your model's predictions:

residuals_train <- train_labels - train_pred
residuals_test <- test_labels - test_pred

residuals_data <- data.frame(
  Residuals = c(residuals_train, residuals_test),
  Dataset = c(rep('Training', length(residuals_train)), rep('Test', length(residuals_test)))
)

# Now plotting residuals with corrected data
ggplot(residuals_data, aes(x = Residuals, fill = Dataset)) +
  geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
  facet_wrap(~ Dataset) +
  ggtitle('Residuals Distribution')

# Assuming train_labels, test_labels, train_pred, and test_pred are correctly defined

# Adjusted Actual vs. Predicted Data Preparation
actual_pred_data <- data.frame(
  Actual = c(train_labels, test_labels),
  Predicted = c(train_pred, test_pred),
  Dataset = c(rep('Training', length(train_labels)), rep('Test', length(test_labels)))
)

# Plotting Actual vs. Predicted Values
ggplot(actual_pred_data, aes(x = Actual, y = Predicted, colour = Dataset)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'red') +
  xlab('Actual Values') +
  ylab('Predicted Values') +
  scale_colour_manual(values = c('Training' = 'blue', 'Test' = 'red')) +
  ggtitle('Actual vs. Predicted Values')

library(xgboost)

# Calculate feature importance
importance_matrix <- xgb.importance(feature_names = colnames(train_features), model = model_xgb)

# View the feature importance scores
print(importance_matrix)
##                                                                     Feature
##                                                                      <char>
##  1:                                                            DOLLAR_SALES
##  2:                                                                 RESERVE
##  3:                                                                   WHITE
##  4:                ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
##  5:                                                 BRAND_MYTHICAL BEVERAGE
##  6:                                                                POP_SQMI
##  7:                                                      WEEKS_SINCE_LAUNCH
##  8:                                                         REGION_COLORADO
##  9:                                                          REGION_PRAIRIE
## 10:                                                         CALORIC_SEGMENT
## 11:                                                           REGION_KANSAS
## 12:                                                         REGION_NORTHERN
## 13:                                                                  JUICED
## 14:                                                                  SUNSET
## 15:                                                         REGION_MOUNTAIN
## 16:                                                       16SMALL MULTI CUP
## 17:                                                                  PITAYA
## 18:                                                            WEEK_OF_YEAR
## 19:                                                      BRAND_DIET MOONLIT
## 20:                                                            REGION_NOCAL
## 21:                                                                  CASAVA
## 22:                                                          REGION_ARIZONA
## 23:                                                            REGION_SOCAL
## 24:                                        ITEM_MOONLIT GENTLE DRINK CASAVA
## 25:                                                                    JACK
## 26:                                                                   GUAVA
## 27:                                                                      ED
## 28:                      ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK
## 29:                                                                RECOVERY
## 30:                                                      REGION_CALI_NEVADA
## 31:                                                        REGION_NEWMEXICO
## 32:                                                        REGION_DESERT_SW
## 33:                ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 34: ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS
## 35:                                                            2L MULTI JUG
## 36:                                                           BRAND_MOONLIT
## 37:                                                       20SMALL MULTI JUG
## 38:                                                BRAND_SUPER-DUPER JUICED
## 39:                                               BRAND_SUPER-DUPER PUNCHED
## 40:                                                       16SMALL 24ONE CUP
## 41:                                              BRAND_SUPER-DUPER RECOVERY
## 42:                                        ITEM_MOONLIT GENTLE DRINK SUNSET
## 43:                               ITEM_JUMPIN-FISH ENERGY DRINK CASAVA JACK
##                                                                     Feature
##             Gain        Cover    Frequency
##            <num>        <num>        <num>
##  1: 6.949449e-01 2.983306e-01 0.2036831903
##  2: 9.170854e-02 5.823785e-02 0.0438936583
##  3: 5.057352e-02 2.533318e-02 0.0229853226
##  4: 4.039029e-02 1.272669e-02 0.0090002769
##  5: 2.621033e-02 4.603257e-03 0.0027693160
##  6: 2.470054e-02 1.391316e-01 0.1545278316
##  7: 1.832191e-02 1.600026e-01 0.1741899751
##  8: 1.066675e-02 1.031751e-02 0.0232622542
##  9: 9.336688e-03 8.162987e-03 0.0174466907
## 10: 5.512280e-03 1.655020e-02 0.0281085572
## 11: 5.471806e-03 5.639427e-03 0.0119080587
## 12: 3.451857e-03 2.132392e-02 0.0279700914
## 13: 2.836582e-03 5.393177e-03 0.0168928275
## 14: 2.720214e-03 1.429611e-02 0.0221545278
## 15: 2.172500e-03 1.917322e-02 0.0180005539
## 16: 1.341769e-03 3.251368e-03 0.0085848795
## 17: 1.149949e-03 1.944769e-02 0.0113541955
## 18: 1.020890e-03 7.335864e-02 0.0697867627
## 19: 9.891990e-04 3.840291e-04 0.0001384658
## 20: 9.395797e-04 5.570445e-03 0.0112157297
## 21: 8.923480e-04 6.391102e-03 0.0088618111
## 22: 8.460856e-04 9.899813e-03 0.0222929936
## 23: 6.064193e-04 9.033504e-03 0.0135696483
## 24: 5.770397e-04 1.139353e-02 0.0058155636
## 25: 4.419043e-04 2.750476e-03 0.0036001108
## 26: 3.851232e-04 2.528639e-03 0.0062309610
## 27: 3.406668e-04 9.961911e-03 0.0090002769
## 28: 2.914207e-04 1.380000e-03 0.0022154528
## 29: 2.426000e-04 2.632903e-03 0.0081694821
## 30: 2.347748e-04 2.780865e-03 0.0029077818
## 31: 2.097458e-04 1.697147e-02 0.0120465245
## 32: 1.189504e-04 1.109921e-03 0.0062309610
## 33: 9.019196e-05 1.155648e-03 0.0024923844
## 34: 7.054915e-05 7.105951e-03 0.0040155082
## 35: 6.052376e-05 1.444501e-03 0.0042924398
## 36: 3.529712e-05 3.784081e-03 0.0024923844
## 37: 3.284093e-05 3.041507e-03 0.0020769870
## 38: 2.235556e-05 4.554642e-04 0.0011077264
## 39: 1.761605e-05 2.234395e-03 0.0013846580
## 40: 1.074121e-05 1.831866e-03 0.0016615896
## 41: 7.170666e-06 2.749965e-04 0.0004153974
## 42: 5.514735e-06 5.786144e-04 0.0011077264
## 43: 3.466779e-08 2.433873e-05 0.0001384658
##             Gain        Cover    Frequency
# Plot the feature importance
xgb.plot.importance(importance_matrix = importance_matrix)

# Compute partial dependence data for 'DOLLAR_SALES' and 'CASAVA', CALORIC_SEGMENT, and "ENERGY
# pd <- partial(model_xgb, pred.var = c("DOLLAR_SALES", "CASAVA", "CALORIC_SEGMENT", ENERGY"), train = train_features, grid.resolution = 20)
# 
# # Default PDP
# pdp1 <- plotPartial(pd, plot = TRUE)
# 
# # Add contour lines and use a different color palette
# rwb <- colorRampPalette(c("red", "white", "blue"))
# pdp2 <- plotPartial(pd, contour = TRUE, col.regions = rwb)
# 
# # 3-D surface
# pdp3 <- plotPartial(pd, levelplot = FALSE, zlab = "Predicted Outcome", drape = TRUE, colorkey = TRUE, screen = list(z = -20, x = -60))
# 
# # Combine plots into one window
# grid.arrange(pdp1, pdp2, pdp3, ncol = 3)

XGBOOST Model #2

Model with NO DOLLAR SALES Variable

# Assuming 'df' is your complete dataframe and 'UNIT_SALES' is your target variable
df2 <- df
# Remove DOLLAR_SALES from the features
df2$DOLLAR_SALES <- NULL

# Split the updated data into training and testing sets (assuming you're using a similar approach as before)
set.seed(123)
df2_testtrn <- initial_split(df2, prop = 0.8, strata = UNIT_SALES)
Train <- training(df2_testtrn)
Test <- testing(df2_testtrn)

# Prepare features and labels for XGBoost, excluding DOLLAR_SALES
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES

# Convert data to DMatrix format for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(train_features), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_features), label = test_labels)
# Assuming 'params' and 'best_nrounds' are defined as before

# Train the final model without DOLLAR_SALES
model_xgb_no_dollar_sales <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb_no_dollar_sales, dtrain)
test_pred <- predict(model_xgb_no_dollar_sales, dtest)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)

# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)

train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
# Correcting Residuals Data Frame
# Assuming 'train_labels' and 'test_labels' contain the actual values,
# and 'train_pred' and 'test_pred' contain your model's predictions:

residuals_train <- train_labels - train_pred
residuals_test <- test_labels - test_pred

residuals_data <- data.frame(
  Residuals = c(residuals_train, residuals_test),
  Dataset = c(rep('Training', length(residuals_train)), rep('Test', length(residuals_test)))
)

# Now plotting residuals with corrected data
ggplot(residuals_data, aes(x = Residuals, fill = Dataset)) +
  geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
  facet_wrap(~ Dataset) +
  ggtitle('Residuals Distribution')

# Assuming train_labels, test_labels, train_pred, and test_pred are correctly defined

# Adjusted Actual vs. Predicted Data Preparation
actual_pred_data <- data.frame(
  Actual = c(train_labels, test_labels),
  Predicted = c(train_pred, test_pred),
  Dataset = c(rep('Training', length(train_labels)), rep('Test', length(test_labels)))
)

# Plotting Actual vs. Predicted Values
ggplot(actual_pred_data, aes(x = Actual, y = Predicted, colour = Dataset)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'red') +
  xlab('Actual Values') +
  ylab('Predicted Values') +
  scale_colour_manual(values = c('Training' = 'blue', 'Test' = 'red')) +
  ggtitle('Actual vs. Predicted Values')

cat("Model Performance Metrics:\n",
    "--------------------------\n",
    "Training RMSE: ", train_rmse, "\n",
    "Test RMSE: ", test_rmse, "\n",
    "Training R-squared: ", r_squared_train, "\n",
    "Test R-squared: ", r_squared_test, "\n",
    "Training MAE: ", train_mae, "\n",
    "Test MAE: ", test_mae, "\n",
    "Training MAPE: ", train_mape, "%\n",
    "Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 39.07454
## Test RMSE: 39.25329
## Training R-squared: 0.7624885
## Test R-squared: 0.7506866
## Training MAE: 26.52904
## Test MAE: 26.70213
## Training MAPE: 157.1858%
## Test MAPE: 159.6215%
# Calculate feature importance
importance_matrix2 <- xgb.importance(feature_names = colnames(train_features), model = model_xgb_no_dollar_sales)

# View the feature importance scores
print(importance_matrix2)
##                                                                     Feature
##                                                                      <char>
##  1:                                                                 RESERVE
##  2:                                                                POP_SQMI
##  3:                                                                   WHITE
##  4:                                                      WEEKS_SINCE_LAUNCH
##  5:                                                          REGION_PRAIRIE
##  6:                                                         REGION_COLORADO
##  7:                                                         CALORIC_SEGMENT
##  8:                                                           REGION_KANSAS
##  9:                                                         REGION_NORTHERN
## 10:                                                       16SMALL MULTI CUP
## 11:                                                         REGION_MOUNTAIN
## 12:                                                                  SUNSET
## 13:                                                 BRAND_MYTHICAL BEVERAGE
## 14:                                                                  JUICED
## 15:                                                                  CASAVA
## 16:                                                            REGION_NOCAL
## 17:                ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 18:                                                          REGION_ARIZONA
## 19:                                                            WEEK_OF_YEAR
## 20:                                                                   GUAVA
## 21:                                        ITEM_MOONLIT GENTLE DRINK SUNSET
## 22:                                                                RECOVERY
## 23:                                                        REGION_DESERT_SW
## 24:                                                            REGION_SOCAL
## 25:                                                                      ED
## 26:                                                        REGION_NEWMEXICO
## 27:                                                            2L MULTI JUG
## 28:                                                       16SMALL 24ONE CUP
## 29:                                                                  PITAYA
## 30:                                                      REGION_CALI_NEVADA
## 31:                                                      BRAND_DIET MOONLIT
## 32:                                        ITEM_MOONLIT GENTLE DRINK CASAVA
## 33:                                                                    JACK
## 34:                      ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK
## 35: ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS
## 36:                                              BRAND_SUPER-DUPER RECOVERY
## 37:                                               BRAND_SUPER-DUPER PUNCHED
## 38:                                                           BRAND_MOONLIT
## 39:                ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 40:                                                       20SMALL MULTI JUG
## 41:                                                BRAND_SUPER-DUPER JUICED
##                                                                     Feature
##             Gain        Cover    Frequency
##            <num>        <num>        <num>
##  1: 2.743479e-01 0.0311422969 0.0603201830
##  2: 2.300257e-01 0.4877309541 0.3480560320
##  3: 1.406387e-01 0.0139118842 0.0313036021
##  4: 6.084740e-02 0.1241158505 0.1262149800
##  5: 3.380574e-02 0.0132583230 0.0125786164
##  6: 3.060451e-02 0.0154702145 0.0224413951
##  7: 2.879702e-02 0.0149826183 0.0328759291
##  8: 2.873261e-02 0.0186260407 0.0208690680
##  9: 2.049425e-02 0.0162880098 0.0305889079
## 10: 1.891476e-02 0.0056386017 0.0210120069
## 11: 1.707243e-02 0.0292835808 0.0267295597
## 12: 1.353421e-02 0.0112822221 0.0168667810
## 13: 1.157059e-02 0.0009471601 0.0020011435
## 14: 1.125267e-02 0.0084123235 0.0154373928
## 15: 1.047061e-02 0.0054694362 0.0081475129
## 16: 1.006236e-02 0.0097555855 0.0124356775
## 17: 7.249893e-03 0.0043539325 0.0088622070
## 18: 6.108128e-03 0.0144459561 0.0180102916
## 19: 5.298023e-03 0.0365974367 0.0530303030
## 20: 5.183412e-03 0.0030446432 0.0081475129
## 21: 3.324229e-03 0.0029035100 0.0045740423
## 22: 3.296834e-03 0.0035136964 0.0107204117
## 23: 2.983453e-03 0.0135743502 0.0110062893
## 24: 2.786177e-03 0.0046931602 0.0092910234
## 25: 2.774711e-03 0.0040798356 0.0084333905
## 26: 2.728908e-03 0.0100883374 0.0114351058
## 27: 2.600210e-03 0.0010330629 0.0050028588
## 28: 1.952294e-03 0.0344090695 0.0105774728
## 29: 1.856985e-03 0.0188922547 0.0131503716
## 30: 1.778773e-03 0.0118920100 0.0071469411
## 31: 1.714332e-03 0.0006678825 0.0011435106
## 32: 1.459442e-03 0.0063401330 0.0042881647
## 33: 1.319734e-03 0.0049322759 0.0074328188
## 34: 1.309078e-03 0.0008901489 0.0025728988
## 35: 8.565183e-04 0.0073977413 0.0052887364
## 36: 7.154599e-04 0.0011473592 0.0017152659
## 37: 4.878142e-04 0.0039123637 0.0030017153
## 38: 4.269584e-04 0.0021826761 0.0018582047
## 39: 3.633528e-04 0.0015089903 0.0021440823
## 40: 1.625356e-04 0.0006477580 0.0024299600
## 41: 9.124767e-05 0.0005363135 0.0008576329
##             Gain        Cover    Frequency
xgb.plot.importance(importance_matrix = importance_matrix2)

if (!requireNamespace("pdp", quietly = TRUE)) install.packages("pdp")
if (!requireNamespace("xgboost", quietly = TRUE)) install.packages("xgboost")
library(pdp)
library(xgboost)
pdp::partial(model_xgb_no_dollar_sales, pred.var = "WEEK_OF_YEAR", train = train_features)
##    WEEK_OF_YEAR     yhat
## 1          1.00 72.15863
## 2          2.04 72.16389
## 3          3.08 70.88787
## 4          4.12 70.90670
## 5          5.16 71.13975
## 6          6.20 71.17511
## 7          7.24 71.18012
## 8          8.28 70.85040
## 9          9.32 70.81431
## 10        10.36 70.95667
## 11        11.40 71.07068
## 12        12.44 71.07678
## 13        13.48 71.11003
## 14        14.52 71.31220
## 15        15.56 71.27513
## 16        16.60 71.61472
## 17        17.64 71.48038
## 18        18.68 71.62180
## 19        19.72 71.82469
## 20        20.76 71.93246
## 21        21.80 71.90557
## 22        22.84 71.77916
## 23        23.88 71.66963
## 24        24.92 71.84245
## 25        25.96 71.74708
## 26        27.00 71.67656
## 27        28.04 71.75623
## 28        29.08 71.58050
## 29        30.12 70.44575
## 30        31.16 70.44596
## 31        32.20 70.84234
## 32        33.24 70.97563
## 33        34.28 70.45476
## 34        35.32 70.46765
## 35        36.36 70.72899
## 36        37.40 71.29672
## 37        38.44 71.25609
## 38        39.48 71.09182
## 39        40.52 70.83648
## 40        41.56 70.75148
## 41        42.60 71.28568
## 42        43.64 71.26201
## 43        44.68 71.09134
## 44        45.72 71.08036
## 45        46.76 71.00827
## 46        47.80 71.92899
## 47        48.84 73.60783
## 48        49.88 74.22794
## 49        50.92 74.69990
## 50        51.96 75.44216
## 51        53.00 74.99059
pd <- partial(model_xgb_no_dollar_sales, pred.var = "WEEK_OF_YEAR", train = train_features, grid.resolution = 20)

# Default PDP
pdp1 <- plotPartial(pd, plot = TRUE)



# plot
grid.arrange(pdp1)

Based on the Casava Energy Drink innovation datafram we expect the best 6 months to be between about weeks 20 and weeks 41. Form looking at Plum, we know that the dummy data predictions do not work well with the data and XGBoost.

#cleanup all obj
rm(list = ls())

Stochastic Weighted Average model for Casava Energy Drink, this dumps out the predicted demand. We then used Excel to calculate the optimal order quantity based on the predicted demand and the cost of overstocking and understocking using a simple NewsVendor Model. All visualtions were done in Tabelau.

df <- read_csv("casava_tableau.csv")  #inject the df and we will sub-sample
## Rows: 72455 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): CATEGORY, MANUFACTURER, BRAND, PACKAGE, ITEM, REGION, SEASON, PACK...
## dbl  (7): MARKET_KEY, CALORIC_SEGMENT, UNIT_SALES, DOLLAR_SALES, POP_SQMI, M...
## date (2): DATE, min_launch_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 17
##   MARKET_KEY DATE       CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
##        <dbl> <date>               <dbl> <chr>         <dbl>        <dbl>
## 1          1 2022-09-17               1 ENERGY           21         38.5
## 2          1 2021-09-18               1 ENERGY           27         45.0
## 3          1 2022-11-05               1 ENERGY           33         91.6
## 4          1 2023-04-29               1 ENERGY           54         97.0
## 5          1 2023-04-01               1 ENERGY           23         56.8
## 6          1 2023-04-29               1 ENERGY           24         53.5
## # ℹ 11 more variables: MANUFACTURER <chr>, BRAND <chr>, PACKAGE <chr>,
## #   ITEM <chr>, POP_SQMI <dbl>, REGION <chr>, MONTH <dbl>, SEASON <chr>,
## #   PACKAGE2 <chr>, min_launch_date <date>, WEEKS_SINCE_LAUNCH <dbl>
#filter package by "16SMALL MULTI CUP" and filter out non super-duper
df <- df %>%
  filter(PACKAGE == "16SMALL MULTI CUP",
        # str_detect(ITEM,"SUPER-DUPER|MYTHICAL"))
        str_detect(ITEM,"SUPER-DUPER"))

print(unique(df$ITEM))
## [1] "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA"               
## [2] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK"                     
## [3] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS"
#SUM of Sales of "SUPER-DUPER JUICED  CASAVA SUNSET GUAVA", "SUPER-DUPER PITAYA  CASAVA", "SUPER-DUPER RECOVERY  CASAVA JACK" by WEEKS_SINCE_LAUNCH keep DATE column. Average super duper sales and take ratio of Diet Moonlit to Regular 
df <- df %>%
  group_by(WEEKS_SINCE_LAUNCH) %>%
  summarise(INNOVATION_UNIT_SALES = (sum(UNIT_SALES)) * 0.5/3) %>%
  ungroup()  # This ungroups the df after summarizing, making it easier to work with

# Now you can select all three columns: WEEKS_SINCE_LAUNCH, SUM_UNIT_SALES, and DATE
df <- df %>%
  select(WEEKS_SINCE_LAUNCH, INNOVATION_UNIT_SALES)
# #Convert WEEKS_SINCE_LAUNCH to a DATE starting at week 20 of 2021
# df$DATE <- as.Date("2021-05-17") + (df$WEEKS_SINCE_LAUNCH - 1) * 7
# df <- df %>%
#   select(DATE, INNOVATION_UNIT_SALES)
# 
# head(df)
df
## # A tibble: 109 × 2
##    WEEKS_SINCE_LAUNCH INNOVATION_UNIT_SALES
##                 <dbl>                 <dbl>
##  1                  0                  10.7
##  2                  1                 151. 
##  3                  2                 760. 
##  4                  3                1670. 
##  5                  4                2689. 
##  6                  5                3553  
##  7                  6                4628. 
##  8                  7                5804. 
##  9                  8                5802. 
## 10                  9                4768. 
## # ℹ 99 more rows
write_csv(df,"cassava_newsvendor.csv")